Merge branch 'master' into streaming-aggregation

This commit is contained in:
Alexander Marshalov 2023-10-23 13:16:34 +02:00
commit 5bc3488538
No known key found for this signature in database
546 changed files with 18771 additions and 8304 deletions

View file

@ -6,7 +6,7 @@ body:
attributes:
value: |
Before filling a bug report it would be great to [upgrade](https://docs.victoriametrics.com/#how-to-upgrade)
to [the latest available release](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
to [the latest available release](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest)
and verify whether the bug is reproducible there.
It's also recommended to read the [troubleshooting docs](https://docs.victoriametrics.com/Troubleshooting.html) first.
- type: textarea

View file

@ -17,7 +17,7 @@ jobs:
- name: Setup Go
uses: actions/setup-go@main
with:
go-version: 1.21.1
go-version: 1.21.3
id: go
- name: Code checkout
uses: actions/checkout@master

View file

@ -57,7 +57,7 @@ jobs:
- name: Set up Go
uses: actions/setup-go@v4
with:
go-version: 1.21.1
go-version: 1.21.3
check-latest: true
cache: true
if: ${{ matrix.language == 'go' }}

View file

@ -32,7 +32,7 @@ jobs:
- name: Setup Go
uses: actions/setup-go@v4
with:
go-version: 1.21.1
go-version: 1.21.3
check-latest: true
cache: true
@ -56,7 +56,7 @@ jobs:
- name: Setup Go
uses: actions/setup-go@v4
with:
go-version: 1.21.1
go-version: 1.21.3
check-latest: true
cache: true
@ -81,7 +81,7 @@ jobs:
id: go
uses: actions/setup-go@v4
with:
go-version: 1.21.1
go-version: 1.21.3
check-latest: true
cache: true

View file

@ -6,6 +6,8 @@ on:
paths:
- 'docs/**'
workflow_dispatch: {}
env:
PAGEFIND_VERSION: "1.0.3"
permissions:
contents: read # This is required for actions/checkout and to commit back image update
deployments: write
@ -24,7 +26,16 @@ jobs:
repository: VictoriaMetrics/vmdocs
token: ${{ secrets.VM_BOT_GH_TOKEN }}
path: docs
- uses: peaceiris/actions-hugo@v2
with:
hugo-version: 'latest'
extended: true
- name: Install PageFind #install the static search engine for index build
uses: supplypike/setup-bin@v3
with:
uri: "https://github.com/CloudCannon/pagefind/releases/download/v${{env.PAGEFIND_VERSION}}/pagefind-v${{env.PAGEFIND_VERSION}}-x86_64-unknown-linux-musl.tar.gz"
name: "pagefind"
version: ${{env.PAGEFIND_VERSION}}
- name: Import GPG key
uses: crazy-max/ghaction-import-gpg@v5
with:
@ -45,6 +56,7 @@ jobs:
rm -rf content
cp -r ../main/docs content
make clean-after-copy
make build-search-index
git config --global user.name "${{ steps.import-gpg.outputs.email }}"
git config --global user.email "${{ steps.import-gpg.outputs.email }}"
git add .

10
.gitignore vendored
View file

@ -22,13 +22,3 @@ Gemfile.lock
/_site
_site
*.tmp
# vm binaries
app/victoria-metrics/victoria-metrics
app/vmagent/vmagent
app/vmctl/vmctl
app/vmalert/vmalert
# vm default data folder
app/victoria-metrics/victoria-metrics-data/
app/vmagent/vmagent-remotewrite-data/

View file

@ -28,7 +28,8 @@ all: \
vmauth-prod \
vmbackup-prod \
vmrestore-prod \
vmctl-prod
vmctl-prod \
vmalert-tool-prod
clean:
rm -rf bin/*
@ -40,7 +41,8 @@ publish: package-base \
publish-vmauth \
publish-vmbackup \
publish-vmrestore \
publish-vmctl
publish-vmctl \
publish-vmalert-tool
package: \
package-victoria-metrics \
@ -50,7 +52,8 @@ package: \
package-vmauth \
package-vmbackup \
package-vmrestore \
package-vmctl
package-vmctl \
package-vmalert-tool
vmutils: \
vmagent \
@ -58,7 +61,8 @@ vmutils: \
vmauth \
vmbackup \
vmrestore \
vmctl
vmctl \
vmalert-tool
vmutils-pure: \
vmagent-pure \
@ -66,7 +70,8 @@ vmutils-pure: \
vmauth-pure \
vmbackup-pure \
vmrestore-pure \
vmctl-pure
vmctl-pure \
vmalert-tool-pure
vmutils-linux-amd64: \
vmagent-linux-amd64 \
@ -74,7 +79,8 @@ vmutils-linux-amd64: \
vmauth-linux-amd64 \
vmbackup-linux-amd64 \
vmrestore-linux-amd64 \
vmctl-linux-amd64
vmctl-linux-amd64 \
vmalert-tool-linux-amd64
vmutils-linux-arm64: \
vmagent-linux-arm64 \
@ -82,7 +88,8 @@ vmutils-linux-arm64: \
vmauth-linux-arm64 \
vmbackup-linux-arm64 \
vmrestore-linux-arm64 \
vmctl-linux-arm64
vmctl-linux-arm64 \
vmalert-tool-linux-arm64
vmutils-linux-arm: \
vmagent-linux-arm \
@ -90,7 +97,8 @@ vmutils-linux-arm: \
vmauth-linux-arm \
vmbackup-linux-arm \
vmrestore-linux-arm \
vmctl-linux-arm
vmctl-linux-arm \
vmalert-tool-linux-arm
vmutils-linux-386: \
vmagent-linux-386 \
@ -98,7 +106,8 @@ vmutils-linux-386: \
vmauth-linux-386 \
vmbackup-linux-386 \
vmrestore-linux-386 \
vmctl-linux-386
vmctl-linux-386 \
vmalert-tool-linux-386
vmutils-linux-ppc64le: \
vmagent-linux-ppc64le \
@ -106,7 +115,8 @@ vmutils-linux-ppc64le: \
vmauth-linux-ppc64le \
vmbackup-linux-ppc64le \
vmrestore-linux-ppc64le \
vmctl-linux-ppc64le
vmctl-linux-ppc64le \
vmalert-tool-linux-ppc64le
vmutils-darwin-amd64: \
vmagent-darwin-amd64 \
@ -114,7 +124,8 @@ vmutils-darwin-amd64: \
vmauth-darwin-amd64 \
vmbackup-darwin-amd64 \
vmrestore-darwin-amd64 \
vmctl-darwin-amd64
vmctl-darwin-amd64 \
vmalert-tool-darwin-amd64
vmutils-darwin-arm64: \
vmagent-darwin-arm64 \
@ -122,7 +133,8 @@ vmutils-darwin-arm64: \
vmauth-darwin-arm64 \
vmbackup-darwin-arm64 \
vmrestore-darwin-arm64 \
vmctl-darwin-arm64
vmctl-darwin-arm64 \
vmalert-tool-darwin-arm64
vmutils-freebsd-amd64: \
vmagent-freebsd-amd64 \
@ -130,7 +142,8 @@ vmutils-freebsd-amd64: \
vmauth-freebsd-amd64 \
vmbackup-freebsd-amd64 \
vmrestore-freebsd-amd64 \
vmctl-freebsd-amd64
vmctl-freebsd-amd64 \
vmalert-tool-freebsd-amd64
vmutils-openbsd-amd64: \
vmagent-openbsd-amd64 \
@ -138,7 +151,8 @@ vmutils-openbsd-amd64: \
vmauth-openbsd-amd64 \
vmbackup-openbsd-amd64 \
vmrestore-openbsd-amd64 \
vmctl-openbsd-amd64
vmctl-openbsd-amd64 \
vmalert-tool-openbsd-amd64
vmutils-windows-amd64: \
vmagent-windows-amd64 \
@ -146,7 +160,8 @@ vmutils-windows-amd64: \
vmauth-windows-amd64 \
vmbackup-windows-amd64 \
vmrestore-windows-amd64 \
vmctl-windows-amd64
vmctl-windows-amd64 \
vmalert-tool-windows-amd64
victoria-metrics-crossbuild: \
victoria-metrics-linux-386 \
@ -342,7 +357,8 @@ release-vmutils-goos-goarch: \
vmauth-$(GOOS)-$(GOARCH)-prod \
vmbackup-$(GOOS)-$(GOARCH)-prod \
vmrestore-$(GOOS)-$(GOARCH)-prod \
vmctl-$(GOOS)-$(GOARCH)-prod
vmctl-$(GOOS)-$(GOARCH)-prod \
vmalert-tool-$(GOOS)-$(GOARCH)-prod
cd bin && \
tar --transform="flags=r;s|-$(GOOS)-$(GOARCH)||" -czf vmutils-$(GOOS)-$(GOARCH)-$(PKG_TAG).tar.gz \
vmagent-$(GOOS)-$(GOARCH)-prod \
@ -351,6 +367,7 @@ release-vmutils-goos-goarch: \
vmbackup-$(GOOS)-$(GOARCH)-prod \
vmrestore-$(GOOS)-$(GOARCH)-prod \
vmctl-$(GOOS)-$(GOARCH)-prod \
vmalert-tool-$(GOOS)-$(GOARCH)-prod
&& sha256sum vmutils-$(GOOS)-$(GOARCH)-$(PKG_TAG).tar.gz \
vmagent-$(GOOS)-$(GOARCH)-prod \
vmalert-$(GOOS)-$(GOARCH)-prod \
@ -358,6 +375,7 @@ release-vmutils-goos-goarch: \
vmbackup-$(GOOS)-$(GOARCH)-prod \
vmrestore-$(GOOS)-$(GOARCH)-prod \
vmctl-$(GOOS)-$(GOARCH)-prod \
vmalert-tool-$(GOOS)-$(GOARCH)-prod \
| sed s/-$(GOOS)-$(GOARCH)-prod/-prod/ > vmutils-$(GOOS)-$(GOARCH)-$(PKG_TAG)_checksums.txt
cd bin && rm -rf \
vmagent-$(GOOS)-$(GOARCH)-prod \
@ -365,7 +383,8 @@ release-vmutils-goos-goarch: \
vmauth-$(GOOS)-$(GOARCH)-prod \
vmbackup-$(GOOS)-$(GOARCH)-prod \
vmrestore-$(GOOS)-$(GOARCH)-prod \
vmctl-$(GOOS)-$(GOARCH)-prod
vmctl-$(GOOS)-$(GOARCH)-prod \
vmalert-tool-$(GOOS)-$(GOARCH)-prod
release-vmutils-windows-goarch: \
vmagent-windows-$(GOARCH)-prod \
@ -373,7 +392,8 @@ release-vmutils-windows-goarch: \
vmauth-windows-$(GOARCH)-prod \
vmbackup-windows-$(GOARCH)-prod \
vmrestore-windows-$(GOARCH)-prod \
vmctl-windows-$(GOARCH)-prod
vmctl-windows-$(GOARCH)-prod \
vmalert-tool-windows-$(GOARCH)-prod
cd bin && \
zip vmutils-windows-$(GOARCH)-$(PKG_TAG).zip \
vmagent-windows-$(GOARCH)-prod.exe \
@ -382,6 +402,7 @@ release-vmutils-windows-goarch: \
vmbackup-windows-$(GOARCH)-prod.exe \
vmrestore-windows-$(GOARCH)-prod.exe \
vmctl-windows-$(GOARCH)-prod.exe \
vmalert-tool-windows-$(GOARCH)-prod.exe \
&& sha256sum vmutils-windows-$(GOARCH)-$(PKG_TAG).zip \
vmagent-windows-$(GOARCH)-prod.exe \
vmalert-windows-$(GOARCH)-prod.exe \
@ -389,6 +410,7 @@ release-vmutils-windows-goarch: \
vmbackup-windows-$(GOARCH)-prod.exe \
vmrestore-windows-$(GOARCH)-prod.exe \
vmctl-windows-$(GOARCH)-prod.exe \
vmalert-tool-windows-$(GOARCH)-prod.exe \
> vmutils-windows-$(GOARCH)-$(PKG_TAG)_checksums.txt
cd bin && rm -rf \
vmagent-windows-$(GOARCH)-prod.exe \
@ -396,7 +418,8 @@ release-vmutils-windows-goarch: \
vmauth-windows-$(GOARCH)-prod.exe \
vmbackup-windows-$(GOARCH)-prod.exe \
vmrestore-windows-$(GOARCH)-prod.exe \
vmctl-windows-$(GOARCH)-prod.exe
vmctl-windows-$(GOARCH)-prod.exe \
vmalert-tool-windows-$(GOARCH)-prod.exe
pprof-cpu:
go tool pprof -trim_path=github.com/VictoriaMetrics/VictoriaMetrics@ $(PPROF_FILE)
@ -514,3 +537,4 @@ docs-sync:
SRC=app/vmctl/README.md DST=docs/vmctl.md OLD_URL='/vmctl.html' ORDER=8 TITLE=vmctl $(MAKE) copy-docs
SRC=app/vmgateway/README.md DST=docs/vmgateway.md OLD_URL='/vmgateway.html' ORDER=9 TITLE=vmgateway $(MAKE) copy-docs
SRC=app/vmbackupmanager/README.md DST=docs/vmbackupmanager.md OLD_URL='/vmbackupmanager.html' ORDER=10 TITLE=vmbackupmanager $(MAKE) copy-docs
SRC=app/vmalert-tool/README.md DST=docs/vmalert-tool.md OLD_URL='' ORDER=12 TITLE=vmalert-tool $(MAKE) copy-docs

201
README.md
View file

@ -13,7 +13,7 @@
VictoriaMetrics is a fast, cost-effective and scalable monitoring solution and time series database.
VictoriaMetrics is available in [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases),
VictoriaMetrics is available in [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest),
[Docker images](https://hub.docker.com/r/victoriametrics/victoria-metrics/), [Snap packages](https://snapcraft.io/victoriametrics)
and [source code](https://github.com/VictoriaMetrics/VictoriaMetrics).
@ -29,7 +29,7 @@ If you have questions about VictoriaMetrics, then feel free asking them at [Vict
[Contact us](mailto:info@victoriametrics.com) if you need enterprise support for VictoriaMetrics.
See [features available in enterprise package](https://docs.victoriametrics.com/enterprise.html).
Enterprise binaries can be downloaded and evaluated for free
from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases).
from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest).
See how to request a free trial license [here](https://victoriametrics.com/products/enterprise/trial/).
VictoriaMetrics is developed at a fast pace, so it is recommended periodically checking the [CHANGELOG](https://docs.victoriametrics.com/CHANGELOG.html) and performing [regular upgrades](#how-to-upgrade-victoriametrics).
@ -137,7 +137,8 @@ See also [articles and slides about VictoriaMetrics from our users](https://docs
### Install
To quickly try VictoriaMetrics, just download [VictoriaMetrics executable](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) or [Docker image](https://hub.docker.com/r/victoriametrics/victoria-metrics/) and start it with the desired command-line flags.
To quickly try VictoriaMetrics, just download [VictoriaMetrics executable](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest)
or [Docker image](https://hub.docker.com/r/victoriametrics/victoria-metrics/) and start it with the desired command-line flags.
See also [QuickStart guide](https://docs.victoriametrics.com/Quick-Start.html) for additional information.
VictoriaMetrics can also be installed via these installation methods:
@ -214,8 +215,8 @@ After changes were made, trigger config re-read with the command `curl 127.0.0.1
### Running as Windows service
In order to run as a windows service it is required to create a service configuration for [WinSW](https://github.com/winsw/winsw)
and then install it as a service.
In order to run VictoriaMetrics as a Windows service it is required to create a service configuration for [WinSW](https://github.com/winsw/winsw)
and then install it as a service according to the following guide:
1. Create a service configuration:
@ -225,23 +226,23 @@ and then install it as a service.
<name>VictoriaMetrics</name>
<description>VictoriaMetrics</description>
<executable>%BASE%\victoria-metrics-windows-amd64-prod.exe"</executable>
<onfailure action="restart" delay="10 sec"/>
<onfailure action="restart" delay="20 sec"/>
<resetfailure>1 hour</resetfailure>
<arguments>-envflag.enable</arguments>
<priority>Normal</priority>
<stoptimeout>15 sec</stoptimeout>
<stopparentprocessfirst>true</stopparentprocessfirst>
<startmode>Automatic</startmode>
<waithint>15 sec</waithint>
<sleeptime>1 sec</sleeptime>
<logpath>%BASE%\logs</logpath>
<log mode="roll">
<sizeThreshold>10240</sizeThreshold>
@ -251,7 +252,7 @@ and then install it as a service.
<env name="loggerFormat" value="json" />
<env name="loggerOutput" value="stderr" />
<env name="promscrape_config" value="C:\Program Files\victoria-metrics\promscrape.yml" />
</service>
```
@ -264,6 +265,9 @@ and then install it as a service.
Get-Service VictoriaMetrics | Start-Service
```
See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3781) for more details.
## Prometheus setup
Add the following lines to Prometheus config file (it is usually located at `/etc/prometheus/prometheus.yml`) in order to send data to VictoriaMetrics:
@ -325,7 +329,7 @@ too high memory consumption of Prometheus, then try to lower `max_samples_per_se
Keep in mind that these two params are tightly connected.
Read more about tuning remote write for Prometheus [here](https://prometheus.io/docs/practices/remote_write).
It is recommended upgrading Prometheus to [v2.12.0](https://github.com/prometheus/prometheus/releases) or newer,
It is recommended upgrading Prometheus to [v2.12.0](https://github.com/prometheus/prometheus/releases/latest) or newer,
since previous versions may have issues with `remote_write`.
Take a look also at [vmagent](https://docs.victoriametrics.com/vmagent.html)
@ -352,9 +356,11 @@ See more in [description](https://github.com/VictoriaMetrics/grafana-datasource#
VictoriaMetrics is developed at a fast pace, so it is recommended periodically checking [the CHANGELOG page](https://docs.victoriametrics.com/CHANGELOG.html) and performing regular upgrades.
It is safe upgrading VictoriaMetrics to new versions unless [release notes](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) say otherwise. It is safe skipping multiple versions during the upgrade unless [release notes](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) say otherwise. It is recommended performing regular upgrades to the latest version, since it may contain important bug fixes, performance optimizations or new features.
It is safe upgrading VictoriaMetrics to new versions unless [release notes](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) say otherwise.
It is safe skipping multiple versions during the upgrade unless [release notes](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) say otherwise.
It is recommended performing regular upgrades to the latest version, since it may contain important bug fixes, performance optimizations or new features.
It is also safe downgrading to older versions unless [release notes](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) say otherwise.
It is also safe downgrading to older versions unless [release notes](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) say otherwise.
The following steps must be performed during the upgrade / downgrade procedure:
@ -533,7 +539,7 @@ dd_url: http://victoriametrics:8428/datadog
</div>
vmagent also can accept Datadog metrics format. Depending on where vmagent will forward data,
pick [single-node or cluster URL]((https://docs.victoriametrics.com/url-examples.html#datadog)) formats.
pick [single-node or cluster URL](https://docs.victoriametrics.com/url-examples.html#datadog) formats.
### Sending metrics to Datadog and VictoriaMetrics
@ -851,71 +857,76 @@ For example, `/api/put?extra_label=foo=bar` would add `{foo="bar"}` label to all
## How to send data from NewRelic agent
VictoriaMetrics accepts data from [NewRelic infrastructure agent](https://docs.newrelic.com/docs/infrastructure/install-infrastructure-agent)
at `/api/v1/newrelic/infra/v2/metrics/events/bulk` path.
NewRelic's infrastructure agent sends so-called [Events](https://docs.newrelic.com/docs/infrastructure/manage-your-data/data-instrumentation/default-infrastructure-monitoring-data/#infrastructure-events)
which then transformed by VictoriaMetrics to the [Prometheus exposition format](https://github.com/prometheus/docs/blob/main/content/docs/instrumenting/exposition_formats.md#text-based-format).
at `/newrelic/infra/v2/metrics/events/bulk` HTTP path.
VictoriaMetrics receives [Events](https://docs.newrelic.com/docs/infrastructure/manage-your-data/data-instrumentation/default-infrastructure-monitoring-data/#infrastructure-events)
from NewRelic agent at the given path, transforms them to [raw samples](https://docs.victoriametrics.com/keyConcepts.html#raw-samples)
according to [these docs](#newrelic-agent-data-mapping) before storing the raw samples to the database.
NewRelic's infrastructure agent allows configuring destinations for metrics forwarding via ENV variable `COLLECTOR_URL`.
It is also required to specify `NRIA_LICENSE_KEY`, which is available only after registration into account of the NewRelic cloud.
You need passing `COLLECTOR_URL` and `NRIA_LICENSE_KEY` environment variables to NewRelic infrastructure agent in order to send the collected metrics to VictoriaMetrics.
The `COLLECTOR_URL` must point to `/newrelic` HTTP endpoint at VictoriaMetrics, while the `NRIA_LICENSE_KEY` must contain NewRelic license key,
which can be obtained [here](https://newrelic.com/signup).
For example, if VictoriaMetrics runs at `localhost:8428`, then the following command can be used for running NewRelic infrastructure agent:
To configure NewRelic infrastructure agent for forwarding metrics to VictoriaMetrics use the following example:
```console
COLLECTOR_URL="http://localhost:8428/newrelic/api/v1" NRIA_LICENSE_KEY="YOUR_LICENSE_KEY" ./newrelic-infra
COLLECTOR_URL="http://localhost:8428/newrelic" NRIA_LICENSE_KEY="NEWRELIC_LICENSE_KEY" ./newrelic-infra
```
### NewRelic agent data mapping
### NewRelic agent data mapping
VictoriaMetrics maps [NewRelic Events](https://docs.newrelic.com/docs/infrastructure/manage-your-data/data-instrumentation/default-infrastructure-monitoring-data/#infrastructure-events)
to [raw samples](https://docs.victoriametrics.com/keyConcepts.html#raw-samples) in the following way:
1. Every numeric field is converted into a raw sample with the corresponding name.
1. The `eventType` and all the other fields with `string` value type are attached to every raw sample as [metric labels](https://docs.victoriametrics.com/keyConcepts.html#labels).
1. The `timestamp` field is used as timestamp for the ingested [raw sample](https://docs.victoriametrics.com/keyConcepts.html#raw-samples).
The `timestamp` field may be specified either in seconds or in milliseconds since the [Unix Epoch](https://en.wikipedia.org/wiki/Unix_time).
If the `timestamp` field is missing, then the raw sample is stored with the current timestamp.
For example, let's import the following NewRelic Events request to VictoriaMetrics:
As example, lets create `newrelic.json` file with the following content:
```json
[
{
"Events":[
{
"eventType":"SystemSample",
"entityKey":"macbook-pro.local",
"cpuPercent":25.056660790748904,
"cpuUserPercent":8.687987912389374,
"cpuSystemPercent":16.36867287835953,
"cpuIOWaitPercent":0,
"cpuIdlePercent":74.94333920925109,
"cpuStealPercent":0,
"loadAverageOneMinute":5.42333984375,
"loadAverageFiveMinute":4.099609375,
"loadAverageFifteenMinute":3.58203125
}
]
}
]
{
"Events":[
{
"eventType":"SystemSample",
"entityKey":"macbook-pro.local",
"cpuPercent":25.056660790748904,
"cpuUserPercent":8.687987912389374,
"cpuSystemPercent":16.36867287835953,
"cpuIOWaitPercent":0,
"cpuIdlePercent":74.94333920925109,
"cpuStealPercent":0,
"loadAverageOneMinute":5.42333984375,
"loadAverageFiveMinute":4.099609375,
"loadAverageFifteenMinute":3.58203125
}
]
}
]
```
Let's use cUrl to send `newrelic.json` to single-node VictoriaMetrics:
Save this JSON into `newrelic.json` file and then use the following command in order to import it into VictoriaMetrics:
```console
curl -X POST -H 'Content-Type: application/json' --data-binary @newrelic.json http://localhost:8428/newrelic/api/v1/infra/v2/metrics/events/bulk
curl -X POST -H 'Content-Type: application/json' --data-binary @newrelic.json http://localhost:8428/newrelic/infra/v2/metrics/events/bulk
```
If data was successfully ingested, you'll get `{"status":"ok"}` response. Let's fetch ingested data from VictoriaMetrics
in vmui via query `{__name__!=""}`:
Let's fetch the ingested data via [data export API](#how-to-export-data-in-json-line-format):
```console
system_sample_cpu_io_wait_percent{entity_key="macbook-pro.local"} 0
system_sample_cpu_idle_percent{entity_key="macbook-pro.local"} 74.9433392092
system_sample_cpu_percent{entity_key="macbook-pro.local"} 25.056660790748
system_sample_cpu_steal_percent{entity_key="macbook-pro.local"} 0
system_sample_cpu_system_percent{entity_key="macbook-pro.local"} 16.368672878359
system_sample_cpu_user_percent{entity_key="macbook-pro.local"} 8.687987912389
system_sample_load_average_fifteen_minute{entity_key="macbook-pro.local"} 3.58203125
system_sample_load_average_five_minute{entity_key="macbook-pro.local"} 4.099609375
system_sample_load_average_one_minute{entity_key="macbook-pro.local"} 5.42333984375
curl http://localhost:8428/api/v1/export -d 'match={eventType="SystemSample"}'
{"metric":{"__name__":"cpuStealPercent","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[0],"timestamps":[1697407970000]}
{"metric":{"__name__":"loadAverageFiveMinute","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[4.099609375],"timestamps":[1697407970000]}
{"metric":{"__name__":"cpuIOWaitPercent","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[0],"timestamps":[1697407970000]}
{"metric":{"__name__":"cpuSystemPercent","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[16.368672878359],"timestamps":[1697407970000]}
{"metric":{"__name__":"loadAverageOneMinute","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[5.42333984375],"timestamps":[1697407970000]}
{"metric":{"__name__":"cpuUserPercent","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[8.687987912389],"timestamps":[1697407970000]}
{"metric":{"__name__":"cpuIdlePercent","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[74.9433392092],"timestamps":[1697407970000]}
{"metric":{"__name__":"loadAverageFifteenMinute","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[3.58203125],"timestamps":[1697407970000]}
{"metric":{"__name__":"cpuPercent","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[25.056660790748],"timestamps":[1697407970000]}
```
The fields in `newrelic.json` are transformed in the following way:
1. `eventType` filed is used as prefix for all metrics in the object;
2. `entityKey` or any other field with `string` value type is used as label attached to all metrics in the object;
3. the rest fields with numeric values will be used as metrics;
4. the additional field `timestamp` can be added to the payload to set the timestamp for all metrics. If omitted,
current time is used.
## Prometheus querying API usage
VictoriaMetrics supports the following handlers from [Prometheus querying API](https://prometheus.io/docs/prometheus/latest/querying/api/):
@ -1060,7 +1071,7 @@ VictoriaMetrics supports the following handlers from [Graphite Tags API](https:/
## How to build from sources
We recommend using either [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) or
We recommend using either [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) or
[docker images](https://hub.docker.com/r/victoriametrics/victoria-metrics/) instead of building VictoriaMetrics
from sources. Building from sources is reasonable when developing additional features specific
to your needs or when testing bugfixes.
@ -1875,7 +1886,8 @@ to historical data.
See [how to configure multiple retentions in VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#retention-filters).
Retention filters can be evaluated for free by downloading and using enterprise binaries from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases).
Retention filters can be evaluated for free by downloading and using enterprise binaries from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest).
See how to request a free trial license [here](https://victoriametrics.com/products/enterprise/trial/).
## Downsampling
@ -1896,7 +1908,8 @@ Please, note that intervals of `-downsampling.period` must be multiples of each
In case [deduplication](https://docs.victoriametrics.com/#deduplication) is enabled value of `-dedup.minScrapeInterval` must also be multiple of `-downsampling.period` intervals.
This is required to ensure consistency of deduplication and downsampling results.
The downsampling can be evaluated for free by downloading and using enterprise binaries from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases).
The downsampling can be evaluated for free by downloading and using enterprise binaries from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest).
See how to request a free trial license [here](https://victoriametrics.com/products/enterprise/trial/).
## Multi-tenancy
@ -1959,7 +1972,7 @@ and [the general security page at VictoriaMetrics website](https://victoriametri
The only option is increasing the limit on [the number of open files in the OS](https://medium.com/@muhammadtriwibowo/set-permanently-ulimit-n-open-files-in-ubuntu-4d61064429a).
The recommendation is not specific for VictoriaMetrics only but also for any service which handles many HTTP connections and stores data on disk.
* VictoriaMetrics is a write-heavy application and its performance depends on disk performance. So be careful with other
applications or utilities (like [fstrim](http://manpages.ubuntu.com/manpages/bionic/man8/fstrim.8.html))
applications or utilities (like [fstrim](https://manpages.ubuntu.com/manpages/lunar/en/man8/fstrim.8.html))
which could [exhaust disk resources](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1521).
* The recommended filesystem is `ext4`, the recommended persistent storage is [persistent HDD-based disk on GCP](https://cloud.google.com/compute/docs/disks/#pdspecs),
since it is protected from hardware failures via internal replication and it can be [resized on the fly](https://cloud.google.com/compute/docs/disks/add-persistent-disk#resize_pd).
@ -2125,7 +2138,7 @@ and [cardinality explorer docs](#cardinality-explorer).
* It is recommended inspecting logs during troubleshooting, since they may contain useful information.
* It is recommended upgrading to the latest available release from [this page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases),
* It is recommended upgrading to the latest available release from [this page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest),
since the encountered issue could be already fixed there.
* It is recommended to have at least 50% of spare resources for CPU, disk IO and RAM, so VictoriaMetrics could handle short spikes in the workload without performance issues.
@ -2136,6 +2149,11 @@ and [cardinality explorer docs](#cardinality-explorer).
can be [monitored](#monitoring) via `vm_free_disk_space_bytes` metric. The total size of data
stored on the disk can be monitored via sum of `vm_data_size_bytes` metrics.
* If you run VictoriaMetrics on a host with 16 or more CPU cores, then it may be needed to tune the `-search.maxWorkersPerQuery` command-line flag
in order to improve query performance. If VictoriaMetrics serves big number of concurrent `select` queries, then try reducing the value for this flag.
If VcitoriaMetrics serves heavy queries, which select `>10K` of [time series](https://docs.victoriametrics.com/keyConcepts.html#time-series) and/or process `>100M`
of [raw samples](https://docs.victoriametrics.com/keyConcepts.html#raw-samples) per query, then try setting the value for this flag to the number of available CPU cores.
* VictoriaMetrics buffers incoming data in memory for up to a few seconds before flushing it to persistent storage.
This may lead to the following "issues":
* Data becomes available for querying in a few seconds after inserting. It is possible to flush in-memory buffers to searchable parts
@ -2172,15 +2190,19 @@ and [cardinality explorer docs](#cardinality-explorer).
This suppresses default gap filling algorithm used by VictoriaMetrics - by default it assumes
each time series is continuous instead of discrete, so it fills gaps between real samples with regular intervals.
* Metrics and labels leading to [high cardinality](https://docs.victoriametrics.com/FAQ.html#what-is-high-cardinality) or [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate) can be determined via [cardinality explorer](#cardinality-explorer) and via [/api/v1/status/tsdb](#tsdb-stats) endpoint.
* Metrics and labels leading to [high cardinality](https://docs.victoriametrics.com/FAQ.html#what-is-high-cardinality)
or [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate) can be determined
via [cardinality explorer](#cardinality-explorer) and via [/api/v1/status/tsdb](#tsdb-stats) endpoint.
* New time series can be logged if `-logNewSeries` command-line flag is passed to VictoriaMetrics.
* VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.
This prevents from ingesting metrics with too many labels. It is recommended [monitoring](#monitoring) `vm_metrics_with_dropped_labels_total`
* VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag
and drops superflouos labels. This prevents from ingesting metrics with too many labels.
It is recommended [monitoring](#monitoring) `vm_metrics_with_dropped_labels_total`
metric in order to determine whether `-maxLabelsPerTimeseries` must be adjusted for your workload.
* If you store Graphite metrics like `foo.bar.baz` in VictoriaMetrics, then `{__graphite__="foo.*.baz"}` filter can be used for selecting such metrics. See [these docs](#selecting-graphite-metrics) for details.
* If you store Graphite metrics like `foo.bar.baz` in VictoriaMetrics, then `{__graphite__="foo.*.baz"}` filter can be used for selecting such metrics.
See [these docs](#selecting-graphite-metrics) for details. You can also query Graphite metrics with [Graphite querying API](#graphite-render-api-usage).
* VictoriaMetrics ignores `NaN` values during data ingestion.
@ -2322,7 +2344,8 @@ See also [high availability docs](#high-availability) and [backup docs](#backups
VictoriaMetrics supports backups via [vmbackup](https://docs.victoriametrics.com/vmbackup.html)
and [vmrestore](https://docs.victoriametrics.com/vmrestore.html) tools.
We also provide [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager.html) tool for enterprise subscribers.
Enterprise binaries can be downloaded and evaluated for free from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases).
Enterprise binaries can be downloaded and evaluated for free from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest).
See how to request a free trial license [here](https://victoriametrics.com/products/enterprise/trial/).
## vmalert
@ -2401,7 +2424,7 @@ Feel free asking any questions regarding VictoriaMetrics:
* [Twitter](https://twitter.com/VictoriaMetrics/)
* [Linkedin](https://www.linkedin.com/company/victoriametrics/)
* [Reddit](https://www.reddit.com/r/VictoriaMetrics/)
* [Telegram-en](https://t.me/VictoriaMetrics_en?)
* [Telegram-en](https://t.me/VictoriaMetrics_en)
* [Telegram-ru](https://t.me/VictoriaMetrics_ru1)
* [Google groups](https://groups.google.com/forum/#!forum/victorametrics-users)
* [Mastodon](https://mastodon.social/@victoriametrics/)
@ -2484,7 +2507,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-denyQueryTracing
Whether to disable the ability to trace queries. See https://docs.victoriametrics.com/#query-tracing
-downsampling.period array
Comma-separated downsampling periods in the format 'offset:period'. For example, '30d:10m' instructs to leave a single sample per 10 minutes for samples older than 30 days. When setting multiple downsampling periods, it is necessary for the periods to be multiples of each other. See https://docs.victoriametrics.com/#downsampling for details. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Comma-separated downsampling periods in the format 'offset:period'. For example, '30d:10m' instructs to leave a single sample per 10 minutes for samples older than 30 days. When setting multiple downsampling periods, it is necessary for the periods to be multiples of each other. See https://docs.victoriametrics.com/#downsampling for details. This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
Supports an array of values separated by comma or specified via multiple flags.
-dryRun
Whether to check config files without running VictoriaMetrics. The following config files are checked: -promscrape.config, -relabelConfig and -streamAggr.config. Unknown config entries aren't allowed in -promscrape.config by default. This can be changed with -promscrape.config.strictParse=false command-line flag
@ -2495,7 +2518,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-eula
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-filestream.disableFadvise
Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU
-finalMergeDelay duration
@ -2568,11 +2591,11 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-internStringMaxLen int
The maximum length for strings to intern. A lower limit may save memory at the cost of higher CPU usage. See https://en.wikipedia.org/wiki/String_interning . See also -internStringDisableCache and -internStringCacheExpireDuration (default 500)
-license string
enterprise license key. This flag is available only in VictoriaMetrics enterprise. Documentation - https://docs.victoriametrics.com/enterprise.html, for more information, visit https://victoriametrics.com/products/enterprise/ . To request a trial license, go to https://victoriametrics.com/products/enterprise/trial/
Lisense key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed via file specified by -licenseFile command-line flag
-license.forceOffline
enables offline license verification. License keys issued must support this feature. Contact our support team for license keys with offline check support.
Whether to enable offline verification for VictoriaMetrics Enterprise license key, which has been passed either via -license or via -licenseFile command-line flag. The issued license key must support offline verification feature. Contact info@victoriametrics.com if you need offline license verification. This flag is avilable only in Enterprise binaries
-licenseFile string
path to file with enterprise license key. This flag is available only in VictoriaMetrics enterprise. Documentation - https://docs.victoriametrics.com/enterprise.html, for more information, visit https://victoriametrics.com/products/enterprise/ . To request a trial license, go to https://victoriametrics.com/products/enterprise/trial/
Path to file with license key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed inline via -license command-line flag
-logNewSeries
Whether to log new series. This option is for debug purposes only. It can lead to performance issues when big number of new series are ingested into VictoriaMetrics
-loggerDisableTimestamps
@ -2592,7 +2615,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-loggerWarnsPerSecondLimit int
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
-maxConcurrentInserts int
The maximum number of concurrent insert requests. The default value should work for most cases, since it minimizes memory usage. The default value can be increased when clients send data over slow networks. See also -insert.maxQueueDuration (default 8)
The maximum number of concurrent insert requests. Default value should work for most cases, since it minimizes the memory usage. The default value can be increased when clients send data over slow networks. See also -insert.maxQueueDuration (default 8)
-maxInsertRequestSize size
The maximum size in bytes of a single Prometheus remote_write API request
Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 33554432)
@ -2630,14 +2653,16 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
Items in the previous caches are removed when the percent of requests it serves becomes lower than this value. Higher values reduce memory usage at the cost of higher CPU usage. See also -cacheExpireDuration (default 0.1)
-promscrape.azureSDCheckInterval duration
Interval for checking for changes in Azure. This works only if azure_sd_configs is configured in '-promscrape.config' file. See https://docs.victoriametrics.com/sd_configs.html#azure_sd_configs for details (default 1m0s)
-promscrape.cluster.memberLabel string
If non-empty, then the label with this name and the -promscrape.cluster.memberNum value is added to all the scraped metrics. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info
-promscrape.cluster.memberNum string
The number of number in the cluster of scrapers. It must be a unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster. Can be specified as pod name of Kubernetes StatefulSet - pod-name-Num, where Num is a numeric part of pod name (default "0")
The number of vmagent instance in the cluster of scrapers. It must be a unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster. Can be specified as pod name of Kubernetes StatefulSet - pod-name-Num, where Num is a numeric part of pod name. See also -promscrape.cluster.memberLabel . See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info (default "0")
-promscrape.cluster.membersCount int
The number of members in a cluster of scrapers. Each member must have a unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default, cluster scraping is disabled, i.e. a single scraper scrapes all the targets
The number of members in a cluster of scrapers. Each member must have a unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default, cluster scraping is disabled, i.e. a single scraper scrapes all the targets. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info (default 1)
-promscrape.cluster.name string
Optional name of the cluster. If multiple vmagent clusters scrape the same targets, then each cluster must have unique name in order to properly de-duplicate samples received from these clusters. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2679
Optional name of the cluster. If multiple vmagent clusters scrape the same targets, then each cluster must have unique name in order to properly de-duplicate samples received from these clusters. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info
-promscrape.cluster.replicationFactor int
The number of members in the cluster, which scrape the same targets. If the replication factor is greater than 1, then the deduplication must be enabled at remote storage side. See https://docs.victoriametrics.com/#deduplication (default 1)
The number of members in the cluster, which scrape the same targets. If the replication factor is greater than 1, then the deduplication must be enabled at remote storage side. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info (default 1)
-promscrape.config string
Optional path to Prometheus config file with 'scrape_configs' section containing targets to scrape. The path can point to local file and to http url. See https://docs.victoriametrics.com/#how-to-scrape-prometheus-exporters-such-as-node-exporter for details
-promscrape.config.dryRun
@ -2728,7 +2753,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-relabelConfig string
Optional path to a file with relabeling rules, which are applied to all the ingested metrics. The path can point either to local file or to http url. See https://docs.victoriametrics.com/#relabeling for details. The config is reloaded on SIGHUP signal
-retentionFilter array
Retention filter in the format 'filter:retention'. For example, '{env="dev"}:3d' configures the retention for time series with env="dev" label to 3 days. See https://docs.victoriametrics.com/#retention-filters for details. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Retention filter in the format 'filter:retention'. For example, '{env="dev"}:3d' configures the retention for time series with env="dev" label to 3 days. See https://docs.victoriametrics.com/#retention-filters for details. This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
Supports an array of values separated by comma or specified via multiple flags.
-retentionPeriod value
Data with timestamps outside the retentionPeriod is automatically deleted. The minimum retentionPeriod is 24h or 1d. See also -retentionFilter
@ -2806,6 +2831,8 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
The maximum number of tag values returned from /api/v1/label/<label_name>/values (default 100000)
-search.maxUniqueTimeseries int
The maximum number of unique time series, which can be selected during /api/v1/query and /api/v1/query_range queries. This option allows limiting memory usage (default 300000)
-search.maxWorkersPerQuery int
The maximum number of CPU cores a single query can use. The default value should work good for most cases. The flag can be set to lower values for improving performance of big number of concurrently executed queries. The flag can be set to bigger values for improving performance of heavy queries, which scan big number of time series (>10K) and/or big number of samples (>100M). There is no sense in setting this flag to values bigger than the number of CPU cores available on the system (default 4)
-search.minStalenessInterval duration
The minimum interval for staleness calculations. This flag could be useful for removing gaps on graphs generated from time series with irregular intervals between samples. See also '-search.maxStalenessInterval'
-search.noStaleMarkers

View file

@ -88,6 +88,12 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
return true
}
if strings.HasPrefix(path, "/vmui/") {
if strings.HasPrefix(path, "/vmui/static/") {
// Allow clients caching static contents for long period of time, since it shouldn't change over time.
// Path to static contents (such as js and css) must be changed whenever its contents is changed.
// See https://developer.chrome.com/docs/lighthouse/performance/uses-long-cache-ttl/
w.Header().Set("Cache-Control", "max-age=31536000")
}
r.URL.Path = path
vmuiFileServer.ServeHTTP(w, r)
return true

View file

@ -1,14 +1,13 @@
{
"files": {
"main.css": "./static/css/main.3258b6c4.css",
"main.js": "./static/js/main.41b816cc.js",
"main.css": "./static/css/main.9a224445.css",
"main.js": "./static/js/main.02178f4b.js",
"static/js/522.b5ae4365.chunk.js": "./static/js/522.b5ae4365.chunk.js",
"static/media/Lato-Regular.ttf": "./static/media/Lato-Regular.d714fec1633b69a9c2e9.ttf",
"static/media/Lato-Bold.ttf": "./static/media/Lato-Bold.32360ba4b57802daa4d6.ttf",
"static/media/MetricsQL.md": "./static/media/MetricsQL.957b90ab4cb4852eec26.md",
"index.html": "./index.html"
},
"entrypoints": [
"static/css/main.3258b6c4.css",
"static/js/main.41b816cc.js"
"static/css/main.9a224445.css",
"static/js/main.02178f4b.js"
]
}

View file

@ -1 +1 @@
<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.ico"/><meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=5"/><meta name="theme-color" content="#000000"/><meta name="description" content="UI for VictoriaMetrics"/><link rel="apple-touch-icon" href="./apple-touch-icon.png"/><link rel="icon" type="image/png" sizes="32x32" href="./favicon-32x32.png"><link rel="manifest" href="./manifest.json"/><title>VM UI</title><script src="./dashboards/index.js" type="module"></script><meta name="twitter:card" content="summary_large_image"><meta name="twitter:image" content="./preview.jpg"><meta name="twitter:title" content="UI for VictoriaMetrics"><meta name="twitter:description" content="Explore and troubleshoot your VictoriaMetrics data"><meta name="twitter:site" content="@VictoriaMetrics"><meta property="og:title" content="Metric explorer for VictoriaMetrics"><meta property="og:description" content="Explore and troubleshoot your VictoriaMetrics data"><meta property="og:image" content="./preview.jpg"><meta property="og:type" content="website"><script defer="defer" src="./static/js/main.41b816cc.js"></script><link href="./static/css/main.3258b6c4.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.ico"/><meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=5"/><meta name="theme-color" content="#000000"/><meta name="description" content="UI for VictoriaMetrics"/><link rel="apple-touch-icon" href="./apple-touch-icon.png"/><link rel="icon" type="image/png" sizes="32x32" href="./favicon-32x32.png"><link rel="manifest" href="./manifest.json"/><title>VM UI</title><script src="./dashboards/index.js" type="module"></script><meta name="twitter:card" content="summary_large_image"><meta name="twitter:image" content="./preview.jpg"><meta name="twitter:title" content="UI for VictoriaMetrics"><meta name="twitter:description" content="Explore and troubleshoot your VictoriaMetrics data"><meta name="twitter:site" content="@VictoriaMetrics"><meta property="og:title" content="Metric explorer for VictoriaMetrics"><meta property="og:description" content="Explore and troubleshoot your VictoriaMetrics data"><meta property="og:image" content="./preview.jpg"><meta property="og:type" content="website"><script defer="defer" src="./static/js/main.02178f4b.js"></script><link href="./static/css/main.9a224445.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load diff

View file

@ -46,7 +46,7 @@ additionally to [discovering Prometheus-compatible targets and scraping metrics
## Quick Start
Please download `vmutils-*` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) (
Please download `vmutils-*` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) (
`vmagent` is also available in [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags)),
unpack it and pass the following flags to the `vmagent` binary in order to start scraping Prometheus-compatible targets
and sending the data to the Prometheus-compatible remote storage:
@ -1025,7 +1025,7 @@ See also [troubleshooting docs](https://docs.victoriametrics.com/Troubleshooting
* [Reading metrics from Kafka](#reading-metrics-from-kafka)
* [Writing metrics to Kafka](#writing-metrics-to-kafka)
The enterprise version of vmagent is available for evaluation at [releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) page
The enterprise version of vmagent is available for evaluation at [releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) page
in `vmutils-...-enterprise.tar.gz` archives and in [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags) with tags containing `enterprise` suffix.
See how to request a free trial license [here](https://victoriametrics.com/products/enterprise/trial/).
@ -1074,7 +1074,7 @@ data_format = "influx"
#### Command-line flags for Kafka consumer
These command-line flags are available only in [enterprise](https://docs.victoriametrics.com/enterprise.html) version of `vmagent`,
which can be downloaded for evaluation from [releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) page
which can be downloaded for evaluation from [releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) page
(see `vmutils-...-enterprise.tar.gz` archives) and from [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags) with tags containing `enterprise` suffix.
```
@ -1134,7 +1134,7 @@ Two types of auth are supported:
## How to build from sources
We recommend using [official binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - `vmagent` is located in the `vmutils-...` archives.
We recommend using [official binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) - `vmagent` is located in the `vmutils-...` archives.
It may be needed to build `vmagent` from source code when developing or testing new feature or bugfix.
@ -1246,7 +1246,9 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-eula
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-filestream.disableFadvise
Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU
-flagsAuthKey string
Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
-fs.disableMmap
@ -1309,40 +1311,40 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
-internStringMaxLen int
The maximum length for strings to intern. A lower limit may save memory at the cost of higher CPU usage. See https://en.wikipedia.org/wiki/String_interning . See also -internStringDisableCache and -internStringCacheExpireDuration (default 500)
-kafka.consumer.topic array
Kafka topic names for data consumption. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Kafka topic names for data consumption. This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
Supports an array of values separated by comma or specified via multiple flags.
-kafka.consumer.topic.basicAuth.password array
Optional basic auth password for -kafka.consumer.topic. Must be used in conjunction with any supported auth methods for kafka client, specified by flag -kafka.consumer.topic.options='security.protocol=SASL_SSL;sasl.mechanisms=PLAIN' . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Optional basic auth password for -kafka.consumer.topic. Must be used in conjunction with any supported auth methods for kafka client, specified by flag -kafka.consumer.topic.options='security.protocol=SASL_SSL;sasl.mechanisms=PLAIN' . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
Supports an array of values separated by comma or specified via multiple flags.
-kafka.consumer.topic.basicAuth.username array
Optional basic auth username for -kafka.consumer.topic. Must be used in conjunction with any supported auth methods for kafka client, specified by flag -kafka.consumer.topic.options='security.protocol=SASL_SSL;sasl.mechanisms=PLAIN' . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Optional basic auth username for -kafka.consumer.topic. Must be used in conjunction with any supported auth methods for kafka client, specified by flag -kafka.consumer.topic.options='security.protocol=SASL_SSL;sasl.mechanisms=PLAIN' . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
Supports an array of values separated by comma or specified via multiple flags.
-kafka.consumer.topic.brokers array
List of brokers to connect for given topic, e.g. -kafka.consumer.topic.broker=host-1:9092;host-2:9092 . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
List of brokers to connect for given topic, e.g. -kafka.consumer.topic.broker=host-1:9092;host-2:9092 . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
Supports an array of values separated by comma or specified via multiple flags.
-kafka.consumer.topic.concurrency array
Configures consumer concurrency for topic specified via -kafka.consumer.topic flag.This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Configures consumer concurrency for topic specified via -kafka.consumer.topic flag.This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html (default 1)
Supports array of values separated by comma or specified via multiple flags.
-kafka.consumer.topic.defaultFormat string
Expected data format in the topic if -kafka.consumer.topic.format is skipped. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html (default "promremotewrite")
Expected data format in the topic if -kafka.consumer.topic.format is skipped. This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html (default "promremotewrite")
-kafka.consumer.topic.format array
data format for corresponding kafka topic. Valid formats: influx, prometheus, promremotewrite, graphite, jsonline . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
data format for corresponding kafka topic. Valid formats: influx, prometheus, promremotewrite, graphite, jsonline . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
Supports an array of values separated by comma or specified via multiple flags.
-kafka.consumer.topic.groupID array
Defines group.id for topic. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Defines group.id for topic. This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
Supports an array of values separated by comma or specified via multiple flags.
-kafka.consumer.topic.isGzipped array
Enables gzip setting for topic messages payload. Only prometheus, jsonline and influx formats accept gzipped messages.This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Enables gzip setting for topic messages payload. Only prometheus, jsonline and influx formats accept gzipped messages.This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
Supports array of values separated by comma or specified via multiple flags.
-kafka.consumer.topic.options array
Optional key=value;key1=value2 settings for topic consumer. See full configuration options at https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Optional key=value;key1=value2 settings for topic consumer. See full configuration options at https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
Supports an array of values separated by comma or specified via multiple flags.
-license string
enterprise license key. This flag is available only in VictoriaMetrics enterprise. Documentation - https://docs.victoriametrics.com/enterprise.html, for more information, visit https://victoriametrics.com/products/enterprise/ . To request a trial license, go to https://victoriametrics.com/products/enterprise/trial/
Lisense key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed via file specified by -licenseFile command-line flag
-license.forceOffline
enables offline license verification. License keys issued must support this feature. Contact our support team for license keys with offline check support.
Whether to enable offline verification for VictoriaMetrics Enterprise license key, which has been passed either via -license or via -licenseFile command-line flag. The issued license key must support offline verification feature. Contact info@victoriametrics.com if you need offline license verification. This flag is avilable only in Enterprise binaries
-licenseFile string
path to file with enterprise license key. This flag is available only in VictoriaMetrics enterprise. Documentation - https://docs.victoriametrics.com/enterprise.html, for more information, visit https://victoriametrics.com/products/enterprise/ . To request a trial license, go to https://victoriametrics.com/products/enterprise/trial/
Path to file with license key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed inline via -license command-line flag
-loggerDisableTimestamps
Whether to disable writing timestamps in logs
-loggerErrorsPerSecondLimit int
@ -1360,7 +1362,7 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
-loggerWarnsPerSecondLimit int
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
-maxConcurrentInserts int
The maximum number of concurrent insert requests. The default value should work for most cases, since it minimizes memory usage. The default value can be increased when clients send data over slow networks. See also -insert.maxQueueDuration (default 8)
The maximum number of concurrent insert requests. Default value should work for most cases, since it minimizes the memory usage. The default value can be increased when clients send data over slow networks. See also -insert.maxQueueDuration (default 8)
-maxInsertRequestSize size
The maximum size in bytes of a single Prometheus remote_write API request
Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 33554432)
@ -1393,15 +1395,15 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
-promscrape.azureSDCheckInterval duration
Interval for checking for changes in Azure. This works only if azure_sd_configs is configured in '-promscrape.config' file. See https://docs.victoriametrics.com/sd_configs.html#azure_sd_configs for details (default 1m0s)
-promscrape.cluster.memberLabel string
If non-empty, then the label with this name and the -promscrape.cluster.memberNum value is added to all the scraped metrics
If non-empty, then the label with this name and the -promscrape.cluster.memberNum value is added to all the scraped metrics. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info
-promscrape.cluster.memberNum string
The number of vmagent instance in the cluster of scrapers. It must be a unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster. Can be specified as pod name of Kubernetes StatefulSet - pod-name-Num, where Num is a numeric part of pod name. See also -promscrape.cluster.memberLabel (default "0")
The number of vmagent instance in the cluster of scrapers. It must be a unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster. Can be specified as pod name of Kubernetes StatefulSet - pod-name-Num, where Num is a numeric part of pod name. See also -promscrape.cluster.memberLabel . See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info (default "0")
-promscrape.cluster.membersCount int
The number of members in a cluster of scrapers. Each member must have a unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default, cluster scraping is disabled, i.e. a single scraper scrapes all the targets
The number of members in a cluster of scrapers. Each member must have a unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default, cluster scraping is disabled, i.e. a single scraper scrapes all the targets. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info (default 1)
-promscrape.cluster.name string
Optional name of the cluster. If multiple vmagent clusters scrape the same targets, then each cluster must have unique name in order to properly de-duplicate samples received from these clusters. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2679
Optional name of the cluster. If multiple vmagent clusters scrape the same targets, then each cluster must have unique name in order to properly de-duplicate samples received from these clusters. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info
-promscrape.cluster.replicationFactor int
The number of members in the cluster, which scrape the same targets. If the replication factor is greater than 1, then the deduplication must be enabled at remote storage side. See https://docs.victoriametrics.com/#deduplication (default 1)
The number of members in the cluster, which scrape the same targets. If the replication factor is greater than 1, then the deduplication must be enabled at remote storage side. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info (default 1)
-promscrape.config string
Optional path to Prometheus config file with 'scrape_configs' section containing targets to scrape. The path can point to local file and to http url. See https://docs.victoriametrics.com/#how-to-scrape-prometheus-exporters-such-as-node-exporter for details
-promscrape.config.dryRun
@ -1551,7 +1553,7 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
The maximum number of unique series vmagent can send to remote storage systems during the last 24 hours. Excess series are logged and dropped. This can be useful for limiting series churn rate. See https://docs.victoriametrics.com/vmagent.html#cardinality-limiter
-remoteWrite.maxDiskUsagePerURL array
The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath for each -remoteWrite.url. When buffer size reaches the configured maximum, then old data is dropped when adding new data to the buffer. Buffered data is stored in ~500MB chunks. It is recommended to set the value for this flag to a multiple of the block size 500MB. Disk usage is unlimited if the value is set to 0
Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB.
Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB. (default 0)
Supports array of values separated by comma or specified via multiple flags.
-remoteWrite.maxHourlySeries int
The maximum number of unique series vmagent can send to remote storage systems during the last hour. Excess series are logged and dropped. This can be useful for limiting series cardinality. See https://docs.victoriametrics.com/vmagent.html#cardinality-limiter
@ -1581,28 +1583,28 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
-remoteWrite.queues int
The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues isn't enough for sending high volume of collected data to remote storage. Default value is 2 * numberOfAvailableCPUs (default 8)
-remoteWrite.rateLimit array
Optional rate limit in bytes per second for data sent to the corresponding -remoteWrite.url. By default, the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data is sent after temporary unavailability of the remote storage
Optional rate limit in bytes per second for data sent to the corresponding -remoteWrite.url. By default, the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data is sent after temporary unavailability of the remote storage (default 0)
Supports array of values separated by comma or specified via multiple flags.
-remoteWrite.relabelConfig string
Optional path to file with relabeling configs, which are applied to all the metrics before sending them to -remoteWrite.url. See also -remoteWrite.urlRelabelConfig. The path can point either to local file or to http url. See https://docs.victoriametrics.com/vmagent.html#relabeling
-remoteWrite.roundDigits array
Round metric values to this number of decimal digits after the point before writing them to remote storage. Examples: -remoteWrite.roundDigits=2 would round 1.236 to 1.24, while -remoteWrite.roundDigits=-1 would round 126.78 to 130. By default, digits rounding is disabled. Set it to 100 for disabling it for a particular remote storage. This option may be used for improving data compression for the stored metrics
Round metric values to this number of decimal digits after the point before writing them to remote storage. Examples: -remoteWrite.roundDigits=2 would round 1.236 to 1.24, while -remoteWrite.roundDigits=-1 would round 126.78 to 130. By default, digits rounding is disabled. Set it to 100 for disabling it for a particular remote storage. This option may be used for improving data compression for the stored metrics (default 100)
Supports array of values separated by comma or specified via multiple flags.
-remoteWrite.sendTimeout array
Timeout for sending a single block of data to the corresponding -remoteWrite.url (default 1m)
Timeout for sending a single block of data to the corresponding -remoteWrite.url (default 1m0s)
Supports array of values separated by comma or specified via multiple flags.
-remoteWrite.shardByURL
Whether to shard outgoing series across all the remote storage systems enumerated via -remoteWrite.url . By default the data is replicated across all the -remoteWrite.url . See https://docs.victoriametrics.com/vmagent.html#sharding-among-remote-storages
-remoteWrite.showURL
Whether to show -remoteWrite.url in the exported metrics. It is hidden by default, since it can contain sensitive info such as auth key
-remoteWrite.significantFigures array
The number of significant figures to leave in metric values before writing them to remote storage. See https://en.wikipedia.org/wiki/Significant_figures . Zero value saves all the significant figures. This option may be used for improving data compression for the stored metrics. See also -remoteWrite.roundDigits
The number of significant figures to leave in metric values before writing them to remote storage. See https://en.wikipedia.org/wiki/Significant_figures . Zero value saves all the significant figures. This option may be used for improving data compression for the stored metrics. See also -remoteWrite.roundDigits (default 0)
Supports array of values separated by comma or specified via multiple flags.
-remoteWrite.streamAggr.config array
Optional path to file with stream aggregation config. See https://docs.victoriametrics.com/stream-aggregation.html . See also -remoteWrite.streamAggr.keepInput, -remoteWrite.streamAggr.dropInput and -remoteWrite.streamAggr.dedupInterval
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.streamAggr.dedupInterval array
Input samples are de-duplicated with this interval before being aggregated. Only the last sample per each time series per each interval is aggregated if the interval is greater than zero
Input samples are de-duplicated with this interval before being aggregated. Only the last sample per each time series per each interval is aggregated if the interval is greater than zero (default 0s)
Supports array of values separated by comma or specified via multiple flags.
-remoteWrite.streamAggr.dropInput array
Whether to drop all the input samples after the aggregation with -remoteWrite.streamAggr.config. By default, only aggregates samples are dropped, while the remaining samples are written to the corresponding -remoteWrite.url . See also -remoteWrite.streamAggr.keepInput and https://docs.victoriametrics.com/stream-aggregation.html

View file

@ -8,7 +8,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parserCommon "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/datadog"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/datadog"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/datadog/stream"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/tenantmetrics"
"github.com/VictoriaMetrics/metrics"
@ -29,12 +29,12 @@ func InsertHandlerForHTTP(at *auth.Token, req *http.Request) error {
return err
}
ce := req.Header.Get("Content-Encoding")
return stream.Parse(req.Body, ce, func(series []parser.Series) error {
return stream.Parse(req.Body, ce, func(series []datadog.Series) error {
return insertRows(at, series, extraLabels)
})
}
func insertRows(at *auth.Token, series []parser.Series, extraLabels []prompbmarshal.Label) error {
func insertRows(at *auth.Token, series []datadog.Series, extraLabels []prompbmarshal.Label) error {
ctx := common.GetPushCtx()
defer common.PutPushCtx(ctx)
@ -63,7 +63,7 @@ func insertRows(at *auth.Token, series []parser.Series, extraLabels []prompbmars
})
}
for _, tag := range ss.Tags {
name, value := parser.SplitTag(tag)
name, value := datadog.SplitTag(tag)
if name == "host" {
name = "exported_host"
}

View file

@ -322,19 +322,19 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
}
w.WriteHeader(http.StatusOK)
return true
case "/newrelic/api/v1":
case "/newrelic":
newrelicCheckRequest.Inc()
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(202)
fmt.Fprintf(w, `{"status":"ok"}`)
return true
case "/newrelic/api/v1/inventory/deltas":
case "/newrelic/inventory/deltas":
newrelicInventoryRequests.Inc()
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(202)
fmt.Fprintf(w, `{"payload":{"version": 1, "state": {}, "reset": "false"}}`)
return true
case "/newrelic/api/v1/infra/v2/metrics/events/bulk":
case "/newrelic/infra/v2/metrics/events/bulk":
newrelicWriteRequests.Inc()
if err := newrelic.InsertHandlerForHTTP(nil, r); err != nil {
newrelicWriteErrors.Inc()
@ -548,19 +548,19 @@ func processMultitenantRequest(w http.ResponseWriter, r *http.Request, path stri
}
w.WriteHeader(http.StatusOK)
return true
case "/newrelic/api/v1":
case "newrelic":
newrelicCheckRequest.Inc()
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(202)
fmt.Fprintf(w, `{"status":"ok"}`)
return true
case "/newrelic/api/v1/inventory/deltas":
case "newrelic/inventory/deltas":
newrelicInventoryRequests.Inc()
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(202)
fmt.Fprintf(w, `{"payload":{"version": 1, "state": {}, "reset": "false"}}`)
return true
case "/newrelic/api/v1/infra/v2/metrics/events/bulk":
case "newrelic/infra/v2/metrics/events/bulk":
newrelicWriteRequests.Inc()
if err := newrelic.InsertHandlerForHTTP(at, r); err != nil {
newrelicWriteErrors.Inc()
@ -643,11 +643,11 @@ var (
opentelemetryPushRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/opentelemetry/api/v1/push", protocol="opentelemetry"}`)
opentelemetryPushErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/opentelemetry/api/v1/push", protocol="opentelemetry"}`)
newrelicWriteRequests = metrics.NewCounter(`vm_http_requests_total{path="/newrelic/api/v1/infra/v2/metrics/events/bulk", protocol="newrelic"}`)
newrelicWriteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/newrelic/api/v1/infra/v2/metrics/events/bulk", protocol="newrelic"}`)
newrelicWriteRequests = metrics.NewCounter(`vm_http_requests_total{path="/newrelic/infra/v2/metrics/events/bulk", protocol="newrelic"}`)
newrelicWriteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/newrelic/infra/v2/metrics/events/bulk", protocol="newrelic"}`)
newrelicInventoryRequests = metrics.NewCounter(`vm_http_requests_total{path="/newrelic/api/v1/inventory/deltas", protocol="newrelic"}`)
newrelicCheckRequest = metrics.NewCounter(`vm_http_requests_total{path="/newrelic/api/v1", protocol="newrelic"}`)
newrelicInventoryRequests = metrics.NewCounter(`vm_http_requests_total{path="/newrelic/inventory/deltas", protocol="newrelic"}`)
newrelicCheckRequest = metrics.NewCounter(`vm_http_requests_total{path="/newrelic", protocol="newrelic"}`)
promscrapeTargetsRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/targets"}`)
promscrapeServiceDiscoveryRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/service-discovery"}`)

View file

@ -8,6 +8,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parserCommon "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/newrelic"
@ -29,42 +30,48 @@ func InsertHandlerForHTTP(at *auth.Token, req *http.Request) error {
}
ce := req.Header.Get("Content-Encoding")
isGzip := ce == "gzip"
return stream.Parse(req.Body, isGzip, func(series []newrelic.Metric) error {
return insertRows(at, series, extraLabels)
return stream.Parse(req.Body, isGzip, func(rows []newrelic.Row) error {
return insertRows(at, rows, extraLabels)
})
}
func insertRows(at *auth.Token, rows []newrelic.Metric, extraLabels []prompbmarshal.Label) error {
func insertRows(at *auth.Token, rows []newrelic.Row, extraLabels []prompbmarshal.Label) error {
ctx := common.GetPushCtx()
defer common.PutPushCtx(ctx)
rowsTotal := 0
samplesCount := 0
tssDst := ctx.WriteRequest.Timeseries[:0]
labels := ctx.Labels[:0]
samples := ctx.Samples[:0]
for i := range rows {
r := &rows[i]
labelsLen := len(labels)
labels = append(labels, prompbmarshal.Label{
Name: "__name__",
Value: r.Metric,
})
for j := range r.Tags {
tag := &r.Tags[j]
tags := r.Tags
srcSamples := r.Samples
for j := range srcSamples {
s := &srcSamples[j]
labelsLen := len(labels)
labels = append(labels, prompbmarshal.Label{
Name: tag.Key,
Value: tag.Value,
Name: "__name__",
Value: bytesutil.ToUnsafeString(s.Name),
})
for k := range tags {
t := &tags[k]
labels = append(labels, prompbmarshal.Label{
Name: bytesutil.ToUnsafeString(t.Key),
Value: bytesutil.ToUnsafeString(t.Value),
})
}
samples = append(samples, prompbmarshal.Sample{
Value: s.Value,
Timestamp: r.Timestamp,
})
tssDst = append(tssDst, prompbmarshal.TimeSeries{
Labels: labels[labelsLen:],
Samples: samples[len(samples)-1:],
})
labels = append(labels, extraLabels...)
}
samples = append(samples, prompbmarshal.Sample{
Value: r.Value,
Timestamp: r.Timestamp,
})
tssDst = append(tssDst, prompbmarshal.TimeSeries{
Labels: labels[labelsLen:],
Samples: samples[len(samples)-1:],
})
labels = append(labels, extraLabels...)
samplesCount += len(srcSamples)
}
ctx.WriteRequest.Timeseries = tssDst
ctx.Labels = labels
@ -72,8 +79,8 @@ func insertRows(at *auth.Token, rows []newrelic.Metric, extraLabels []prompbmars
remotewrite.Push(at, &ctx.WriteRequest)
rowsInserted.Add(len(rows))
if at != nil {
rowsTenantInserted.Get(at).Add(rowsTotal)
rowsTenantInserted.Get(at).Add(samplesCount)
}
rowsPerInsert.Update(float64(len(rows)))
rowsPerInsert.Update(float64(samplesCount))
return nil
}

View file

@ -323,26 +323,32 @@ func (c *client) runWorker() {
}
func (c *client) doRequest(url string, body []byte) (*http.Response, error) {
req := c.newRequest(url, body)
req, err := c.newRequest(url, body)
if err != nil {
return nil, err
}
resp, err := c.hc.Do(req)
if err != nil && errors.Is(err, io.EOF) {
// it is likely connection became stale.
// So we do one more attempt in hope request will succeed.
// If not, the error should be handled by the caller as usual.
// This should help with https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4139
req = c.newRequest(url, body)
req, _ = c.newRequest(url, body)
resp, err = c.hc.Do(req)
}
return resp, err
}
func (c *client) newRequest(url string, body []byte) *http.Request {
func (c *client) newRequest(url string, body []byte) (*http.Request, error) {
reqBody := bytes.NewBuffer(body)
req, err := http.NewRequest(http.MethodPost, url, reqBody)
if err != nil {
logger.Panicf("BUG: unexpected error from http.NewRequest(%q): %s", url, err)
}
c.authCfg.SetHeaders(req, true)
err = c.authCfg.SetHeaders(req, true)
if err != nil {
return nil, err
}
h := req.Header
h.Set("User-Agent", "vmagent")
h.Set("Content-Type", "application/x-protobuf")
@ -360,7 +366,7 @@ func (c *client) newRequest(url string, body []byte) *http.Request {
logger.Warnf("cannot sign remoteWrite request with AWS sigv4: %s", err)
}
}
return req
return req, nil
}
// sendBlockHTTP sends the given block to c.remoteWriteURL.

View file

@ -759,13 +759,12 @@ func (rwctx *remoteWriteCtx) pushInternal(tss []prompbmarshal.TimeSeries) {
}
func (rwctx *remoteWriteCtx) reinitStreamAggr() {
sas := rwctx.sas.Load()
if sas == nil {
sasFile := streamAggrConfig.GetOptionalArg(rwctx.idx)
if sasFile == "" {
// There is no stream aggregation for rwctx
return
}
sasFile := streamAggrConfig.GetOptionalArg(rwctx.idx)
logger.Infof("reloading stream aggregation configs pointed by -remoteWrite.streamAggr.config=%q", sasFile)
metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reloads_total{path=%q}`, sasFile)).Inc()
dedupInterval := streamAggrDedupInterval.GetOptionalArg(rwctx.idx)
@ -776,6 +775,7 @@ func (rwctx *remoteWriteCtx) reinitStreamAggr() {
logger.Errorf("cannot reload stream aggregation config from -remoteWrite.streamAggr.config=%q; continue using the previously loaded config; error: %s", sasFile, err)
return
}
sas := rwctx.sas.Load()
if !sasNew.Equal(sas) {
sasOld := rwctx.sas.Swap(sasNew)
sasOld.MustStop()

103
app/vmalert-tool/Makefile Normal file
View file

@ -0,0 +1,103 @@
# All these commands must run from repository root.
vmalert-tool:
APP_NAME=vmalert-tool $(MAKE) app-local
vmalert-tool-race:
APP_NAME=vmalert-tool RACE=-race $(MAKE) app-local
vmalert-tool-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker
vmalert-tool-pure-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-pure
vmalert-tool-linux-amd64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-amd64
vmalert-tool-linux-arm-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-arm
vmalert-tool-linux-arm64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-arm64
vmalert-tool-linux-ppc64le-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-ppc64le
vmalert-tool-linux-386-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-386
vmalert-tool-darwin-amd64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-darwin-amd64
vmalert-tool-darwin-arm64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-darwin-arm64
vmalert-tool-freebsd-amd64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-freebsd-amd64
vmalert-tool-openbsd-amd64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-openbsd-amd64
vmalert-tool-windows-amd64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-windows-amd64
package-vmalert-tool:
APP_NAME=vmalert-tool $(MAKE) package-via-docker
package-vmalert-tool-pure:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-pure
package-vmalert-tool-amd64:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-amd64
package-vmalert-tool-arm:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-arm
package-vmalert-tool-arm64:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-arm64
package-vmalert-tool-ppc64le:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-ppc64le
package-vmalert-tool-386:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-386
publish-vmalert-tool:
APP_NAME=vmalert-tool $(MAKE) publish-via-docker
vmalert-tool-linux-amd64:
APP_NAME=vmalert-tool CGO_ENABLED=1 GOOS=linux GOARCH=amd64 $(MAKE) app-local-goos-goarch
vmalert-tool-linux-arm:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=arm $(MAKE) app-local-goos-goarch
vmalert-tool-linux-arm64:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=arm64 $(MAKE) app-local-goos-goarch
vmalert-tool-linux-ppc64le:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le $(MAKE) app-local-goos-goarch
vmalert-tool-linux-s390x:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=s390x $(MAKE) app-local-goos-goarch
vmalert-tool-linux-386:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=386 $(MAKE) app-local-goos-goarch
vmalert-tool-darwin-amd64:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=darwin GOARCH=amd64 $(MAKE) app-local-goos-goarch
vmalert-tool-darwin-arm64:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=darwin GOARCH=arm64 $(MAKE) app-local-goos-goarch
vmalert-tool-freebsd-amd64:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=freebsd GOARCH=amd64 $(MAKE) app-local-goos-goarch
vmalert-tool-openbsd-amd64:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=openbsd GOARCH=amd64 $(MAKE) app-local-goos-goarch
vmalert-tool-windows-amd64:
GOARCH=amd64 APP_NAME=vmalert-tool $(MAKE) app-local-windows-goarch
vmalert-tool-pure:
APP_NAME=vmalert-tool $(MAKE) app-local-pure

246
app/vmalert-tool/README.md Normal file
View file

@ -0,0 +1,246 @@
# vmalert-tool
VMAlert command-line tool
## Unit testing for rules
You can use `vmalert-tool` to run unit tests for alerting and recording rules.
It will perform the following actions:
* sets up an isolated VictoriaMetrics instance;
* simulates the periodic ingestion of time series;
* queries the ingested data for recording and alerting rules evaluation like [vmalert](https://docs.victoriametrics.com/vmalert.html);
* checks whether the firing alerts or resulting recording rules match the expected results.
See how to run vmalert-tool for unit test below:
```
# Run vmalert-tool with one or multiple test files via --files cmd-line flag
./vmalert-tool unittest --files test1.yaml --files test2.yaml
```
vmalert-tool unittest is compatible with [Prometheus config format for tests](https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/#test-file-format)
except `promql_expr_test` field. Use `metricsql_expr_test` field name instead. The name is different because vmalert-tool
validates and executes [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html) expressions,
which aren't always backward compatible with [PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/).
### Test file format
The configuration format for files specified in `--files` cmd-line flag is the following:
```yaml
# Path to the files or http url containing [rule groups](https://docs.victoriametrics.com/vmalert.html#groups) configuration.
# Enterprise version of vmalert-tool supports S3 and GCS paths to rules.
rule_files:
[ - <string> ]
# The evaluation interval for rules specified in `rule_files`
[ evaluation_interval: <duration> | default = 1m ]
# Groups listed below will be evaluated by order.
# Not All the groups need not be mentioned, if not, they will be evaluated by define order in rule_files.
group_eval_order:
[ - <string> ]
# The list of unit test files to be checked during evaluation.
tests:
[ - <test_group> ]
```
#### `<test_group>`
```yaml
# Interval between samples for input series
interval: <duration>
# Time series to persist into the database according to configured <interval> before running tests.
input_series:
[ - <series> ]
# Name of the test group, optional
[ name: <string> ]
# Unit tests for alerting rules
alert_rule_test:
[ - <alert_test_case> ]
# Unit tests for Metricsql expressions.
metricsql_expr_test:
[ - <metricsql_expr_test> ]
# External labels accessible for templating.
external_labels:
[ <labelname>: <string> ... ]
```
#### `<series>`
```yaml
# series in the following format '<metric name>{<label name>=<label value>, ...}'
# Examples:
# series_name{label1="value1", label2="value2"}
# go_goroutines{job="prometheus", instance="localhost:9090"}
series: <string>
# values support several special equations:
# 'a+bxc' becomes 'a a+b a+(2*b) a+(3*b) … a+(c*b)'
# Read this as series starts at a, then c further samples incrementing by b.
# 'a-bxc' becomes 'a a-b a-(2*b) a-(3*b) … a-(c*b)'
# Read this as series starts at a, then c further samples decrementing by b (or incrementing by negative b).
# '_' represents a missing sample from scrape
# 'stale' indicates a stale sample
# Examples:
# 1. '-2+4x3' becomes '-2 2 6 10' - series starts at -2, then 3 further samples incrementing by 4.
# 2. ' 1-2x4' becomes '1 -1 -3 -5 -7' - series starts at 1, then 4 further samples decrementing by 2.
# 3. ' 1x4' becomes '1 1 1 1 1' - shorthand for '1+0x4', series starts at 1, then 4 further samples incrementing by 0.
# 4. ' 1 _x3 stale' becomes '1 _ _ _ stale' - the missing sample cannot increment, so 3 missing samples are produced by the '_x3' expression.
values: <string>
```
#### `<alert_test_case>`
vmalert by default adds `alertgroup` and `alertname` to the generated alerts and time series.
So you will need to specify both `groupname` and `alertname` under a single `<alert_test_case>`,
but no need to add them under `exp_alerts`.
You can also pass `--disableAlertgroupLabel` to skip `alertgroup` check.
```yaml
# The time elapsed from time=0s when this alerting rule should be checked.
# Means this rule should be firing at this point, or shouldn't be firing if 'exp_alerts' is empty.
eval_time: <duration>
# Name of the group name to be tested.
groupname: <string>
# Name of the alert to be tested.
alertname: <string>
# List of the expected alerts that are firing under the given alertname at
# the given evaluation time. If you want to test if an alerting rule should
# not be firing, then you can mention only the fields above and leave 'exp_alerts' empty.
exp_alerts:
[ - <alert> ]
```
#### `<alert>`
```yaml
# These are the expanded labels and annotations of the expected alert.
# Note: labels also include the labels of the sample associated with the alert
exp_labels:
[ <labelname>: <string> ]
exp_annotations:
[ <labelname>: <string> ]
```
#### `<metricsql_expr_test>`
```yaml
# Expression to evaluate
expr: <string>
# The time elapsed from time=0s when this expression be evaluated.
eval_time: <duration>
# Expected samples at the given evaluation time.
exp_samples:
[ - <sample> ]
```
#### `<sample>`
```yaml
# Labels of the sample in usual series notation '<metric name>{<label name>=<label value>, ...}'
# Examples:
# series_name{label1="value1", label2="value2"}
# go_goroutines{job="prometheus", instance="localhost:9090"}
labels: <string>
# The expected value of the Metricsql expression.
value: <number>
```
### Example
This is an example input file for unit testing which will pass.
`test.yaml` is the test file which follows the syntax above and `alerts.yaml` contains the alerting rules.
With `rules.yaml` in the same directory, run `./vmalert-tool unittest --files=./unittest/testdata/test.yaml`.
#### `test.yaml`
```yaml
rule_files:
- rules.yaml
evaluation_interval: 1m
tests:
- interval: 1m
input_series:
- series: 'up{job="prometheus", instance="localhost:9090"}'
values: "0+0x1440"
metricsql_expr_test:
- expr: suquery_interval_test
eval_time: 4m
exp_samples:
- labels: '{__name__="suquery_interval_test", datacenter="dc-123", instance="localhost:9090", job="prometheus"}'
value: 1
alert_rule_test:
- eval_time: 2h
groupname: group1
alertname: InstanceDown
exp_alerts:
- exp_labels:
job: prometheus
severity: page
instance: localhost:9090
datacenter: dc-123
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job prometheus has been down for more than 5 minutes."
- eval_time: 0
groupname: group1
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
- eval_time: 0
groupname: group1
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123
```
#### `alerts.yaml`
```yaml
# This is the rules file.
groups:
- name: group1
rules:
- alert: InstanceDown
expr: up == 0
for: 5m
labels:
severity: page
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
- alert: AlwaysFiring
expr: 1
- name: group2
rules:
- record: job:test:count_over_time1m
expr: sum without(instance) (count_over_time(test[1m]))
- record: suquery_interval_test
expr: count_over_time(up[5m:])
```

54
app/vmalert-tool/main.go Normal file
View file

@ -0,0 +1,54 @@
package main
import (
"fmt"
"log"
"os"
"time"
"github.com/urfave/cli/v2"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert-tool/unittest"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
)
func main() {
start := time.Now()
app := &cli.App{
Name: "vmalert-tool",
Usage: "VMAlert command-line tool",
UsageText: "More info in https://docs.victoriametrics.com/vmalert-tool.html",
Version: buildinfo.Version,
Commands: []*cli.Command{
{
Name: "unittest",
Usage: "Run unittest for alerting and recording rules.",
UsageText: "More info in https://docs.victoriametrics.com/vmalert-tool.html#Unit-testing-for-rules",
Flags: []cli.Flag{
&cli.StringSliceFlag{
Name: "files",
Usage: "files to run unittest with. Supports an array of values separated by comma or specified via multiple flags.",
Required: true,
},
&cli.BoolFlag{
Name: "disableAlertgroupLabel",
Usage: "disable adding group's Name as label to generated alerts and time series.",
Required: false,
},
},
Action: func(c *cli.Context) error {
if failed := unittest.UnitTest(c.StringSlice("files"), c.Bool("disableAlertgroupLabel")); failed {
return fmt.Errorf("unittest failed")
}
return nil
},
},
},
}
err := app.Run(os.Args)
if err != nil {
log.Fatalln(err)
}
log.Printf("Total time: %v", time.Since(start))
}

View file

@ -0,0 +1,19 @@
package unittest
import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
)
// alertTestCase holds alert_rule_test cases defined in test file
type alertTestCase struct {
EvalTime *promutils.Duration `yaml:"eval_time"`
GroupName string `yaml:"groupname"`
Alertname string `yaml:"alertname"`
ExpAlerts []expAlert `yaml:"exp_alerts"`
}
// expAlert holds exp_alerts defined in test file
type expAlert struct {
ExpLabels map[string]string `yaml:"exp_labels"`
ExpAnnotations map[string]string `yaml:"exp_annotations"`
}

View file

@ -0,0 +1,182 @@
package unittest
import (
"bytes"
"fmt"
"io"
"net/http"
"regexp"
"strconv"
"strings"
"time"
testutil "github.com/VictoriaMetrics/VictoriaMetrics/app/victoria-metrics/test"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
"github.com/VictoriaMetrics/metricsql"
)
// series holds input_series defined in the test file
type series struct {
Series string `yaml:"series"`
Values string `yaml:"values"`
}
// sequenceValue is an omittable value in a sequence of time series values.
type sequenceValue struct {
Value float64
Omitted bool
}
func httpWrite(address string, r io.Reader) {
resp, err := http.Post(address, "", r)
if err != nil {
logger.Fatalf("failed to send to storage: %v", err)
}
resp.Body.Close()
}
// writeInputSeries send input series to vmstorage and flush them
func writeInputSeries(input []series, interval *promutils.Duration, startStamp time.Time, dst string) error {
r := testutil.WriteRequest{}
for _, data := range input {
expr, err := metricsql.Parse(data.Series)
if err != nil {
return fmt.Errorf("failed to parse series %s: %v", data.Series, err)
}
promvals, err := parseInputValue(data.Values, true)
if err != nil {
return fmt.Errorf("failed to parse input series value %s: %v", data.Values, err)
}
metricExpr, ok := expr.(*metricsql.MetricExpr)
if !ok {
return fmt.Errorf("failed to parse series %s to metric expr: %v", data.Series, err)
}
samples := make([]testutil.Sample, 0, len(promvals))
ts := startStamp
for _, v := range promvals {
if !v.Omitted {
samples = append(samples, testutil.Sample{
Timestamp: ts.UnixMilli(),
Value: v.Value,
})
}
ts = ts.Add(interval.Duration())
}
var ls []testutil.Label
for _, filter := range metricExpr.LabelFilterss[0] {
ls = append(ls, testutil.Label{Name: filter.Label, Value: filter.Value})
}
r.Timeseries = append(r.Timeseries, testutil.TimeSeries{Labels: ls, Samples: samples})
}
data, err := testutil.Compress(r)
if err != nil {
return fmt.Errorf("failed to compress data: %v", err)
}
// write input series to vm
httpWrite(dst, bytes.NewBuffer(data))
vmstorage.Storage.DebugFlush()
return nil
}
// parseInputValue support input like "1", "1+1x1 _ -4 3+20x1", see more examples in test.
func parseInputValue(input string, origin bool) ([]sequenceValue, error) {
var res []sequenceValue
items := strings.Split(input, " ")
reg := regexp.MustCompile(`\D?\d*\D?`)
for _, item := range items {
if item == "stale" {
res = append(res, sequenceValue{Value: decimal.StaleNaN})
continue
}
vals := reg.FindAllString(item, -1)
switch len(vals) {
case 1:
if vals[0] == "_" {
res = append(res, sequenceValue{Omitted: true})
continue
}
v, err := strconv.ParseFloat(vals[0], 64)
if err != nil {
return nil, err
}
res = append(res, sequenceValue{Value: v})
continue
case 2:
p1 := vals[0][:len(vals[0])-1]
v2, err := strconv.ParseInt(vals[1], 10, 64)
if err != nil {
return nil, err
}
option := vals[0][len(vals[0])-1]
switch option {
case '+':
v1, err := strconv.ParseFloat(p1, 64)
if err != nil {
return nil, err
}
res = append(res, sequenceValue{Value: v1 + float64(v2)})
case 'x':
for i := int64(0); i <= v2; i++ {
if p1 == "_" {
if i == 0 {
i = 1
}
res = append(res, sequenceValue{Omitted: true})
continue
}
v1, err := strconv.ParseFloat(p1, 64)
if err != nil {
return nil, err
}
if !origin || v1 == 0 {
res = append(res, sequenceValue{Value: v1 * float64(i)})
continue
}
newVal := fmt.Sprintf("%s+0x%s", p1, vals[1])
newRes, err := parseInputValue(newVal, false)
if err != nil {
return nil, err
}
res = append(res, newRes...)
break
}
default:
return nil, fmt.Errorf("got invalid operation %b", option)
}
case 3:
r1, err := parseInputValue(fmt.Sprintf("%s%s", vals[1], vals[2]), false)
if err != nil {
return nil, err
}
p1 := vals[0][:len(vals[0])-1]
v1, err := strconv.ParseFloat(p1, 64)
if err != nil {
return nil, err
}
option := vals[0][len(vals[0])-1]
var isAdd bool
if option == '+' {
isAdd = true
}
for _, r := range r1 {
if isAdd {
res = append(res, sequenceValue{
Value: r.Value + v1,
})
} else {
res = append(res, sequenceValue{
Value: v1 - r.Value,
})
}
}
default:
return nil, fmt.Errorf("unsupported input %s", input)
}
}
return res, nil
}

View file

@ -0,0 +1,93 @@
package unittest
import (
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
)
func TestParseInputValue(t *testing.T) {
testCases := []struct {
input string
exp []sequenceValue
failed bool
}{
{
"",
nil,
true,
},
{
"testfailed",
nil,
true,
},
// stale doesn't support operations
{
"stalex3",
nil,
true,
},
{
"-4",
[]sequenceValue{{Value: -4}},
false,
},
{
"_",
[]sequenceValue{{Omitted: true}},
false,
},
{
"stale",
[]sequenceValue{{Value: decimal.StaleNaN}},
false,
},
{
"-4x1",
[]sequenceValue{{Value: -4}, {Value: -4}},
false,
},
{
"_x1",
[]sequenceValue{{Omitted: true}},
false,
},
{
"1+1x4",
[]sequenceValue{{Value: 1}, {Value: 2}, {Value: 3}, {Value: 4}, {Value: 5}},
false,
},
{
"2-1x4",
[]sequenceValue{{Value: 2}, {Value: 1}, {Value: 0}, {Value: -1}, {Value: -2}},
false,
},
{
"1+1x1 _ -4 stale 3+20x1",
[]sequenceValue{{Value: 1}, {Value: 2}, {Omitted: true}, {Value: -4}, {Value: decimal.StaleNaN}, {Value: 3}, {Value: 23}},
false,
},
}
for _, tc := range testCases {
output, err := parseInputValue(tc.input, true)
if err != nil != tc.failed {
t.Fatalf("failed to parse %s, expect %t, got %t", tc.input, tc.failed, err != nil)
}
if len(tc.exp) != len(output) {
t.Fatalf("expect %v, got %v", tc.exp, output)
}
for i := 0; i < len(tc.exp); i++ {
if tc.exp[i].Omitted != output[i].Omitted {
t.Fatalf("expect %v, got %v", tc.exp, output)
}
if tc.exp[i].Value != output[i].Value {
if decimal.IsStaleNaN(tc.exp[i].Value) && decimal.IsStaleNaN(output[i].Value) {
continue
}
t.Fatalf("expect %v, got %v", tc.exp, output)
}
}
}
}

View file

@ -0,0 +1,100 @@
package unittest
import (
"context"
"fmt"
"net/url"
"reflect"
"sort"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
"github.com/VictoriaMetrics/metricsql"
)
// metricsqlTestCase holds metricsql_expr_test cases defined in test file
type metricsqlTestCase struct {
Expr string `yaml:"expr"`
EvalTime *promutils.Duration `yaml:"eval_time"`
ExpSamples []expSample `yaml:"exp_samples"`
}
type expSample struct {
Labels string `yaml:"labels"`
Value float64 `yaml:"value"`
}
// checkMetricsqlCase will check metricsql_expr_test cases
func checkMetricsqlCase(cases []metricsqlTestCase, q datasource.QuerierBuilder) (checkErrs []error) {
queries := q.BuildWithParams(datasource.QuerierParams{QueryParams: url.Values{"nocache": {"1"}, "latency_offset": {"1ms"}}, DataSourceType: "prometheus"})
Outer:
for _, mt := range cases {
result, _, err := queries.Query(context.Background(), mt.Expr, durationToTime(mt.EvalTime))
if err != nil {
checkErrs = append(checkErrs, fmt.Errorf(" expr: %q, time: %s, err: %w", mt.Expr,
mt.EvalTime.Duration().String(), err))
continue
}
var gotSamples []parsedSample
for _, s := range result.Data {
sort.Slice(s.Labels, func(i, j int) bool {
return s.Labels[i].Name < s.Labels[j].Name
})
gotSamples = append(gotSamples, parsedSample{
Labels: s.Labels,
Value: s.Values[0],
})
}
var expSamples []parsedSample
for _, s := range mt.ExpSamples {
expLb := datasource.Labels{}
if s.Labels != "" {
metricsqlExpr, err := metricsql.Parse(s.Labels)
if err != nil {
checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s, err: %v", mt.Expr,
mt.EvalTime.Duration().String(), fmt.Errorf("failed to parse labels %q: %w", s.Labels, err)))
continue Outer
}
metricsqlMetricExpr, ok := metricsqlExpr.(*metricsql.MetricExpr)
if !ok {
checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s, err: %v", mt.Expr,
mt.EvalTime.Duration().String(), fmt.Errorf("got unsupported metricsql type")))
continue Outer
}
for _, l := range metricsqlMetricExpr.LabelFilterss[0] {
expLb = append(expLb, datasource.Label{
Name: l.Label,
Value: l.Value,
})
}
}
sort.Slice(expLb, func(i, j int) bool {
return expLb[i].Name < expLb[j].Name
})
expSamples = append(expSamples, parsedSample{
Labels: expLb,
Value: s.Value,
})
}
sort.Slice(expSamples, func(i, j int) bool {
return datasource.LabelCompare(expSamples[i].Labels, expSamples[j].Labels) <= 0
})
sort.Slice(gotSamples, func(i, j int) bool {
return datasource.LabelCompare(gotSamples[i].Labels, gotSamples[j].Labels) <= 0
})
if !reflect.DeepEqual(expSamples, gotSamples) {
checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s,\n exp: %v\n got: %v", mt.Expr,
mt.EvalTime.Duration().String(), parsedSamplesString(expSamples), parsedSamplesString(gotSamples)))
}
}
return
}
func durationToTime(pd *promutils.Duration) time.Time {
if pd == nil {
return time.Time{}
}
return time.UnixMilli(pd.Duration().Milliseconds())
}

View file

@ -0,0 +1,43 @@
rule_files:
- rules.yaml
evaluation_interval: 1m
tests:
- interval: 1m
input_series:
- series: 'up{job="vmagent2", instance="localhost:9090"}'
values: "0+0x1440"
metricsql_expr_test:
- expr: suquery_interval_test
eval_time: 4m
exp_samples:
- labels: '{__name__="suquery_interval_test",datacenter="dc-123", instance="localhost:9090", job="vmagent2"}'
value: 1
alert_rule_test:
- eval_time: 2h
alertname: InstanceDown
exp_alerts:
- exp_labels:
job: vmagent2
severity: page
instance: localhost:9090
datacenter: dc-123
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job vmagent2 has been down for more than 5 minutes."
- eval_time: 0
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
- eval_time: 0
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123

View file

@ -0,0 +1,49 @@
rule_files:
- rules.yaml
tests:
- interval: 1m
name: "Failing test"
input_series:
- series: test
values: "0"
metricsql_expr_test:
- expr: test
eval_time: 0m
exp_samples:
- value: 0
labels: test
# will failed cause there is no "Test" group and rule defined
alert_rule_test:
- eval_time: 0m
groupname: Test
alertname: Test
exp_alerts:
- exp_labels: {}
- interval: 1m
name: Failing alert test
input_series:
- series: 'up{job="test"}'
values: 0x10
alert_rule_test:
# will failed cause rule is firing
- eval_time: 5m
groupname: group1
alertname: InstanceDown
exp_alerts: []
- interval: 1m
name: Failing alert test with missing groupname
input_series:
- series: 'up{job="test"}'
values: 0x10
alert_rule_test:
# will failed cause missing groupname
- eval_time: 5m
alertname: AlwaysFiring
exp_alerts: []

View file

@ -0,0 +1,30 @@
# can be executed successfully but will take more than 1 minute
# not included in unit test now
evaluation_interval: 100d
rule_files:
- rules.yaml
tests:
- interval: 1d
input_series:
- series: test
# Max time in time.Duration is 106751d from 1970 (2^63/10^9), i.e. 2262.
# But VictoriaMetrics supports maxTimestamp value +2 days from now. see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/827.
# We input series to 2024-01-01T00:00:00 here.
values: "0+1x19723"
metricsql_expr_test:
- expr: timestamp(test)
eval_time: 0m
exp_samples:
- value: 0
- expr: test
eval_time: 100d
exp_samples:
- labels: test
value: 100
- expr: timestamp(test)
eval_time: 19000d
exp_samples:
- value: 1641600000 # 19000d -> seconds.

View file

@ -0,0 +1,39 @@
groups:
- name: group1
rules:
- alert: InstanceDown
expr: up == 0
for: 5m
labels:
severity: page
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
- alert: AlwaysFiring
expr: 1
- alert: SameAlertNameWithDifferentGroup
expr: absent(test)
for: 1m
- name: group2
rules:
- record: t1
expr: test
- record: job:test:count_over_time1m
expr: sum without(instance) (count_over_time(test[1m]))
- record: suquery_interval_test
expr: count_over_time(up[5m:])
- alert: SameAlertNameWithDifferentGroup
expr: absent(test)
for: 5m
- name: group3
rules:
- record: t2
expr: t1
- name: group4
rules:
- record: t3
expr: t1

View file

@ -0,0 +1,99 @@
rule_files:
- rules.yaml
evaluation_interval: 1m
group_eval_order: ["group4", "group2", "group3"]
tests:
- interval: 1m
name: "basic test"
input_series:
- series: "test"
values: "_x5 1x5 _ stale"
alert_rule_test:
- eval_time: 1m
groupname: group1
alertname: SameAlertNameWithDifferentGroup
exp_alerts:
- {}
- eval_time: 1m
groupname: group2
alertname: SameAlertNameWithDifferentGroup
exp_alerts: []
- eval_time: 6m
groupname: group1
alertname: SameAlertNameWithDifferentGroup
exp_alerts: []
metricsql_expr_test:
- expr: test
eval_time: 11m
exp_samples:
- labels: '{__name__="test"}'
value: 1
- expr: test
eval_time: 12m
exp_samples: []
- interval: 1m
name: "basic test2"
input_series:
- series: 'up{job="vmagent1", instance="localhost:9090"}'
values: "0+0x1440"
- series: "test"
values: "0+1x1440"
metricsql_expr_test:
- expr: count(ALERTS) by (alertgroup, alertname, alertstate)
eval_time: 4m
exp_samples:
- labels: '{alertgroup="group1", alertname="AlwaysFiring", alertstate="firing"}'
value: 1
- labels: '{alertgroup="group1", alertname="InstanceDown", alertstate="pending"}'
value: 1
- expr: t1
eval_time: 4m
exp_samples:
- value: 4
labels: '{__name__="t1", datacenter="dc-123"}'
- expr: t2
eval_time: 4m
exp_samples:
- value: 4
labels: '{__name__="t2", datacenter="dc-123"}'
- expr: t3
eval_time: 4m
exp_samples:
# t3 is 3 instead of 4 cause it's rules3 is evaluated before rules1
- value: 3
labels: '{__name__="t3", datacenter="dc-123"}'
alert_rule_test:
- eval_time: 10m
groupname: group1
alertname: InstanceDown
exp_alerts:
- exp_labels:
job: vmagent1
severity: page
instance: localhost:9090
datacenter: dc-123
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job vmagent1 has been down for more than 5 minutes."
- eval_time: 0
groupname: group1
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
- eval_time: 0
groupname: alerts
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123

View file

@ -0,0 +1,46 @@
rule_files:
- rules.yaml
evaluation_interval: 1m
tests:
- interval: 1m
input_series:
- series: 'up{job="vmagent2", instance="localhost:9090"}'
values: "0+0x1440"
metricsql_expr_test:
- expr: suquery_interval_test
eval_time: 4m
exp_samples:
- labels: '{__name__="suquery_interval_test",datacenter="dc-123", instance="localhost:9090", job="vmagent2"}'
value: 1
alert_rule_test:
- eval_time: 2h
groupname: group1
alertname: InstanceDown
exp_alerts:
- exp_labels:
job: vmagent2
severity: page
instance: localhost:9090
datacenter: dc-123
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job vmagent2 has been down for more than 5 minutes."
- eval_time: 0
groupname: group1
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
- eval_time: 0
groupname: group1
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123

View file

@ -0,0 +1,83 @@
package unittest
import (
"fmt"
"strconv"
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
)
// parsedSample is a sample with parsed Labels
type parsedSample struct {
Labels datasource.Labels
Value float64
}
func (ps *parsedSample) String() string {
return ps.Labels.String() + " " + strconv.FormatFloat(ps.Value, 'E', -1, 64)
}
func parsedSamplesString(pss []parsedSample) string {
if len(pss) == 0 {
return "nil"
}
s := pss[0].String()
for _, ps := range pss[1:] {
s += ", " + ps.String()
}
return s
}
// labelAndAnnotation holds labels and annotations
type labelAndAnnotation struct {
Labels datasource.Labels
Annotations datasource.Labels
}
func (la *labelAndAnnotation) String() string {
return "Labels:" + la.Labels.String() + "\nAnnotations:" + la.Annotations.String()
}
// labelsAndAnnotations is collection of LabelAndAnnotation
type labelsAndAnnotations []labelAndAnnotation
func (la labelsAndAnnotations) Len() int { return len(la) }
func (la labelsAndAnnotations) Swap(i, j int) { la[i], la[j] = la[j], la[i] }
func (la labelsAndAnnotations) Less(i, j int) bool {
diff := datasource.LabelCompare(la[i].Labels, la[j].Labels)
if diff != 0 {
return diff < 0
}
return datasource.LabelCompare(la[i].Annotations, la[j].Annotations) < 0
}
func (la labelsAndAnnotations) String() string {
if len(la) == 0 {
return "[]"
}
s := "[\n0:" + indentLines("\n"+la[0].String(), " ")
for i, l := range la[1:] {
s += ",\n" + fmt.Sprintf("%d", i+1) + ":" + indentLines("\n"+l.String(), " ")
}
s += "\n]"
return s
}
// indentLines prefixes each line in the supplied string with the given "indent" string.
func indentLines(lines, indent string) string {
sb := strings.Builder{}
n := strings.Split(lines, "\n")
for i, l := range n {
if i > 0 {
sb.WriteString(indent)
}
sb.WriteString(l)
if i != len(n)-1 {
sb.WriteRune('\n')
}
}
return sb.String()
}

View file

@ -0,0 +1,443 @@
package unittest
import (
"context"
"flag"
"fmt"
"net/http"
"os"
"path/filepath"
"reflect"
"sort"
"time"
"gopkg.in/yaml.v2"
vmalertconfig "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/promremotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/prometheus"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
"github.com/VictoriaMetrics/metrics"
)
var (
storagePath string
httpListenAddr = ":8880"
// insert series from 1970-01-01T00:00:00
testStartTime = time.Unix(0, 0).UTC()
testPromWriteHTTPPath = "http://127.0.0.1" + httpListenAddr + "/api/v1/write"
testDataSourcePath = "http://127.0.0.1" + httpListenAddr + "/prometheus"
testRemoteWritePath = "http://127.0.0.1" + httpListenAddr
testHealthHTTPPath = "http://127.0.0.1" + httpListenAddr + "/health"
disableAlertgroupLabel bool
)
const (
testStoragePath = "vmalert-unittest"
testLogLevel = "ERROR"
)
// UnitTest runs unittest for files
func UnitTest(files []string, disableGroupLabel bool) bool {
if err := templates.Load([]string{}, true); err != nil {
logger.Fatalf("failed to load template: %v", err)
}
storagePath = filepath.Join(os.TempDir(), testStoragePath)
processFlags()
vminsert.Init()
vmselect.Init()
// storagePath will be created again when closing vmselect, so remove it again.
defer fs.MustRemoveAll(storagePath)
defer vminsert.Stop()
defer vmselect.Stop()
disableAlertgroupLabel = disableGroupLabel
return rulesUnitTest(files)
}
func rulesUnitTest(files []string) bool {
var failed bool
for _, f := range files {
if err := ruleUnitTest(f); err != nil {
fmt.Println(" FAILED")
fmt.Printf("\nfailed to run unit test for file %q: \n%v", f, err)
failed = true
} else {
fmt.Println(" SUCCESS")
}
}
return failed
}
func ruleUnitTest(filename string) []error {
fmt.Println("\nUnit Testing: ", filename)
b, err := os.ReadFile(filename)
if err != nil {
return []error{fmt.Errorf("failed to read file: %w", err)}
}
var unitTestInp unitTestFile
if err := yaml.UnmarshalStrict(b, &unitTestInp); err != nil {
return []error{fmt.Errorf("failed to unmarshal file: %w", err)}
}
if err := resolveAndGlobFilepaths(filepath.Dir(filename), &unitTestInp); err != nil {
return []error{fmt.Errorf("failed to resolve path for `rule_files`: %w", err)}
}
if unitTestInp.EvaluationInterval.Duration() == 0 {
fmt.Println("evaluation_interval set to 1m by default")
unitTestInp.EvaluationInterval = &promutils.Duration{D: 1 * time.Minute}
}
groupOrderMap := make(map[string]int)
for i, gn := range unitTestInp.GroupEvalOrder {
if _, ok := groupOrderMap[gn]; ok {
return []error{fmt.Errorf("group name repeated in `group_eval_order`: %s", gn)}
}
groupOrderMap[gn] = i
}
testGroups, err := vmalertconfig.Parse(unitTestInp.RuleFiles, nil, true)
if err != nil {
return []error{fmt.Errorf("failed to parse `rule_files`: %w", err)}
}
var errs []error
for _, t := range unitTestInp.Tests {
if err := verifyTestGroup(t); err != nil {
errs = append(errs, err)
continue
}
testErrs := t.test(unitTestInp.EvaluationInterval.Duration(), groupOrderMap, testGroups)
errs = append(errs, testErrs...)
}
if len(errs) > 0 {
return errs
}
return nil
}
func verifyTestGroup(group testGroup) error {
var testGroupName string
if group.TestGroupName != "" {
testGroupName = fmt.Sprintf("testGroupName: %s\n", group.TestGroupName)
}
for _, at := range group.AlertRuleTests {
if at.Alertname == "" {
return fmt.Errorf("\n%s missing required filed \"alertname\"", testGroupName)
}
if !disableAlertgroupLabel && at.GroupName == "" {
return fmt.Errorf("\n%s missing required filed \"groupname\" when flag \"disableAlertGroupLabel\" is false", testGroupName)
}
if disableAlertgroupLabel && at.GroupName != "" {
return fmt.Errorf("\n%s shouldn't set filed \"groupname\" when flag \"disableAlertGroupLabel\" is true", testGroupName)
}
if at.EvalTime == nil {
return fmt.Errorf("\n%s missing required filed \"eval_time\"", testGroupName)
}
}
for _, et := range group.MetricsqlExprTests {
if et.Expr == "" {
return fmt.Errorf("\n%s missing required filed \"expr\"", testGroupName)
}
if et.EvalTime == nil {
return fmt.Errorf("\n%s missing required filed \"eval_time\"", testGroupName)
}
}
return nil
}
func processFlags() {
flag.Parse()
for _, fv := range []struct {
flag string
value string
}{
{flag: "storageDataPath", value: storagePath},
{flag: "loggerLevel", value: testLogLevel},
{flag: "search.disableCache", value: "true"},
// set storage retention time to 100 years, allow to store series from 1970-01-01T00:00:00.
{flag: "retentionPeriod", value: "100y"},
{flag: "datasource.url", value: testDataSourcePath},
{flag: "remoteWrite.url", value: testRemoteWritePath},
} {
// panics if flag doesn't exist
if err := flag.Lookup(fv.flag).Value.Set(fv.value); err != nil {
logger.Fatalf("unable to set %q with value %q, err: %v", fv.flag, fv.value, err)
}
}
}
func setUp() {
vmstorage.Init(promql.ResetRollupResultCacheIfNeeded)
go httpserver.Serve(httpListenAddr, false, func(w http.ResponseWriter, r *http.Request) bool {
switch r.URL.Path {
case "/prometheus/api/v1/query":
if err := prometheus.QueryHandler(nil, time.Now(), w, r); err != nil {
httpserver.Errorf(w, r, "%s", err)
}
return true
case "/prometheus/api/v1/write", "/api/v1/write":
if err := promremotewrite.InsertHandler(r); err != nil {
httpserver.Errorf(w, r, "%s", err)
}
return true
default:
}
return false
})
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
readyCheckFunc := func() bool {
resp, err := http.Get(testHealthHTTPPath)
if err != nil {
return false
}
_ = resp.Body.Close()
return resp.StatusCode == 200
}
checkCheck:
for {
select {
case <-ctx.Done():
logger.Fatalf("http server can't be ready in 30s")
default:
if readyCheckFunc() {
break checkCheck
}
time.Sleep(3 * time.Second)
}
}
}
func tearDown() {
if err := httpserver.Stop(httpListenAddr); err != nil {
logger.Errorf("cannot stop the webservice: %s", err)
}
vmstorage.Stop()
metrics.UnregisterAllMetrics()
fs.MustRemoveAll(storagePath)
}
// resolveAndGlobFilepaths joins all relative paths in a configuration
// with a given base directory and replaces all globs with matching files.
func resolveAndGlobFilepaths(baseDir string, utf *unitTestFile) error {
for i, rf := range utf.RuleFiles {
if rf != "" && !filepath.IsAbs(rf) {
utf.RuleFiles[i] = filepath.Join(baseDir, rf)
}
}
var globbedFiles []string
for _, rf := range utf.RuleFiles {
m, err := filepath.Glob(rf)
if err != nil {
return err
}
if len(m) == 0 {
fmt.Fprintln(os.Stderr, " WARNING: no file match pattern", rf)
}
globbedFiles = append(globbedFiles, m...)
}
utf.RuleFiles = globbedFiles
return nil
}
func (tg *testGroup) test(evalInterval time.Duration, groupOrderMap map[string]int, testGroups []vmalertconfig.Group) (checkErrs []error) {
// set up vmstorage and http server for ingest and read queries
setUp()
// tear down vmstorage and clean the data dir
defer tearDown()
err := writeInputSeries(tg.InputSeries, tg.Interval, testStartTime, testPromWriteHTTPPath)
if err != nil {
return []error{err}
}
q, err := datasource.Init(nil)
if err != nil {
return []error{fmt.Errorf("failed to init datasource: %v", err)}
}
rw, err := remotewrite.NewDebugClient()
if err != nil {
return []error{fmt.Errorf("failed to init wr: %v", err)}
}
alertEvalTimesMap := map[time.Duration]struct{}{}
alertExpResultMap := map[time.Duration]map[string]map[string][]expAlert{}
for _, at := range tg.AlertRuleTests {
et := at.EvalTime.Duration()
alertEvalTimesMap[et] = struct{}{}
if _, ok := alertExpResultMap[et]; !ok {
alertExpResultMap[et] = make(map[string]map[string][]expAlert)
}
if _, ok := alertExpResultMap[et][at.GroupName]; !ok {
alertExpResultMap[et][at.GroupName] = make(map[string][]expAlert)
}
alertExpResultMap[et][at.GroupName][at.Alertname] = at.ExpAlerts
}
alertEvalTimes := make([]time.Duration, 0, len(alertEvalTimesMap))
for k := range alertEvalTimesMap {
alertEvalTimes = append(alertEvalTimes, k)
}
sort.Slice(alertEvalTimes, func(i, j int) bool {
return alertEvalTimes[i] < alertEvalTimes[j]
})
// sort group eval order according to the given "group_eval_order".
sort.Slice(testGroups, func(i, j int) bool {
return groupOrderMap[testGroups[i].Name] < groupOrderMap[testGroups[j].Name]
})
// create groups with given rule
var groups []*rule.Group
for _, group := range testGroups {
ng := rule.NewGroup(group, q, time.Minute, tg.ExternalLabels)
groups = append(groups, ng)
}
evalIndex := 0
maxEvalTime := testStartTime.Add(tg.maxEvalTime())
for ts := testStartTime; ts.Before(maxEvalTime) || ts.Equal(maxEvalTime); ts = ts.Add(evalInterval) {
for _, g := range groups {
errs := g.ExecOnce(context.Background(), func() []notifier.Notifier { return nil }, rw, ts)
for err := range errs {
if err != nil {
checkErrs = append(checkErrs, fmt.Errorf("\nfailed to exec group: %q, time: %s, err: %w", g.Name,
ts, err))
}
}
// flush series after each group evaluation
vmstorage.Storage.DebugFlush()
}
// check alert_rule_test case at every eval time
for evalIndex < len(alertEvalTimes) {
if ts.Sub(testStartTime) > alertEvalTimes[evalIndex] ||
alertEvalTimes[evalIndex] >= ts.Add(evalInterval).Sub(testStartTime) {
break
}
gotAlertsMap := map[string]map[string]labelsAndAnnotations{}
for _, g := range groups {
if disableAlertgroupLabel {
g.Name = ""
}
if _, ok := alertExpResultMap[time.Duration(ts.UnixNano())][g.Name]; !ok {
continue
}
if _, ok := gotAlertsMap[g.Name]; !ok {
gotAlertsMap[g.Name] = make(map[string]labelsAndAnnotations)
}
for _, r := range g.Rules {
ar, isAlertRule := r.(*rule.AlertingRule)
if !isAlertRule {
continue
}
if _, ok := alertExpResultMap[time.Duration(ts.UnixNano())][g.Name][ar.Name]; ok {
for _, got := range ar.GetAlerts() {
if got.State != notifier.StateFiring {
continue
}
if disableAlertgroupLabel {
delete(got.Labels, "alertgroup")
}
laa := labelAndAnnotation{
Labels: datasource.ConvertToLabels(got.Labels),
Annotations: datasource.ConvertToLabels(got.Annotations),
}
gotAlertsMap[g.Name][ar.Name] = append(gotAlertsMap[g.Name][ar.Name], laa)
}
}
}
}
for groupname, gres := range alertExpResultMap[alertEvalTimes[evalIndex]] {
for alertname, res := range gres {
var expAlerts labelsAndAnnotations
for _, expAlert := range res {
if expAlert.ExpLabels == nil {
expAlert.ExpLabels = make(map[string]string)
}
// alertGroupNameLabel is added as additional labels when `disableAlertGroupLabel` is false
if !disableAlertgroupLabel {
expAlert.ExpLabels["alertgroup"] = groupname
}
// alertNameLabel is added as additional labels in vmalert.
expAlert.ExpLabels["alertname"] = alertname
expAlerts = append(expAlerts, labelAndAnnotation{
Labels: datasource.ConvertToLabels(expAlert.ExpLabels),
Annotations: datasource.ConvertToLabels(expAlert.ExpAnnotations),
})
}
sort.Sort(expAlerts)
gotAlerts := gotAlertsMap[groupname][alertname]
sort.Sort(gotAlerts)
if !reflect.DeepEqual(expAlerts, gotAlerts) {
var testGroupName string
if tg.TestGroupName != "" {
testGroupName = fmt.Sprintf("testGroupName: %s,\n", tg.TestGroupName)
}
expString := indentLines(expAlerts.String(), " ")
gotString := indentLines(gotAlerts.String(), " ")
checkErrs = append(checkErrs, fmt.Errorf("\n%s groupname: %s, alertname: %s, time: %s, \n exp:%v, \n got:%v ",
testGroupName, groupname, alertname, alertEvalTimes[evalIndex].String(), expString, gotString))
}
}
}
evalIndex++
}
}
checkErrs = append(checkErrs, checkMetricsqlCase(tg.MetricsqlExprTests, q)...)
return checkErrs
}
// unitTestFile holds the contents of a single unit test file
type unitTestFile struct {
RuleFiles []string `yaml:"rule_files"`
EvaluationInterval *promutils.Duration `yaml:"evaluation_interval"`
GroupEvalOrder []string `yaml:"group_eval_order"`
Tests []testGroup `yaml:"tests"`
}
// testGroup is a group of input series and test cases associated with it
type testGroup struct {
Interval *promutils.Duration `yaml:"interval"`
InputSeries []series `yaml:"input_series"`
AlertRuleTests []alertTestCase `yaml:"alert_rule_test"`
MetricsqlExprTests []metricsqlTestCase `yaml:"metricsql_expr_test"`
ExternalLabels map[string]string `yaml:"external_labels"`
TestGroupName string `yaml:"name"`
}
// maxEvalTime returns the max eval time among all alert_rule_test and metricsql_expr_test
func (tg *testGroup) maxEvalTime() time.Duration {
var maxd time.Duration
for _, alert := range tg.AlertRuleTests {
if alert.EvalTime.Duration() > maxd {
maxd = alert.EvalTime.Duration()
}
}
for _, met := range tg.MetricsqlExprTests {
if met.EvalTime.Duration() > maxd {
maxd = met.EvalTime.Duration()
}
}
return maxd
}

View file

@ -0,0 +1,47 @@
package unittest
import (
"os"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
)
func TestMain(m *testing.M) {
if err := templates.Load([]string{}, true); err != nil {
os.Exit(1)
}
os.Exit(m.Run())
}
func TestUnitRule(t *testing.T) {
testCases := []struct {
name string
disableGroupLabel bool
files []string
failed bool
}{
{
name: "run multi files",
files: []string{"./testdata/test1.yaml", "./testdata/test2.yaml"},
failed: false,
},
{
name: "disable group label",
disableGroupLabel: true,
files: []string{"./testdata/disable-group-label.yaml"},
failed: false,
},
{
name: "failing test",
files: []string{"./testdata/failed-test.yaml"},
failed: true,
},
}
for _, tc := range testCases {
fail := UnitTest(tc.files, tc.disableGroupLabel)
if fail != tc.failed {
t.Fatalf("failed to test %s, expect %t, got %t", tc.name, tc.failed, fail)
}
}
}

View file

@ -439,7 +439,7 @@ If `-clusterMode` is enabled and the `tenant` in a particular group is missing,
is obtained from `-defaultTenant.prometheus` or `-defaultTenant.graphite` depending on the `type` of the group.
The enterprise version of vmalert is available in `vmutils-*-enterprise.tar.gz` files
at [release page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) and in `*-enterprise`
at [release page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) and in `*-enterprise`
tags at [Docker Hub](https://hub.docker.com/r/victoriametrics/vmalert/tags).
### Reading rules from object storage
@ -754,6 +754,11 @@ See full description for these flags in `./vmalert -help`.
* `limit` group's param has no effect during replay (might be changed in future);
* `keep_firing_for` alerting rule param has no effect during replay (might be changed in future).
## Unit Testing for Rules
You can use `vmalert-tool` to test your alerting and recording rules like [promtool does](https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/).
See more details [here](https://docs.victoriametrics.com/vmalert-tool.html#Unit-testing-for-rules).
## Monitoring
`vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page.
@ -942,7 +947,7 @@ The shortlist of configuration flags is the following:
{% raw %}
```console
-clusterMode
If clusterMode is enabled, then vmalert automatically adds the tenant specified in config groups to -datasource.url, -remoteWrite.url and -remoteRead.url. See https://docs.victoriametrics.com/vmalert.html#multitenancy . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
If clusterMode is enabled, then vmalert automatically adds the tenant specified in config groups to -datasource.url, -remoteWrite.url and -remoteRead.url. See https://docs.victoriametrics.com/vmalert.html#multitenancy . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-configCheckInterval duration
Interval for checking for changes in '-rule' or '-notifier.config' files. By default, the checking is disabled. Send SIGHUP signal in order to force config check for changes.
-datasource.appendTypePrefix
@ -980,13 +985,11 @@ The shortlist of configuration flags is the following:
-datasource.queryStep duration
How far a value can fallback to when evaluating queries. For example, if -datasource.queryStep=15s then param "step" with value "15s" will be added to every query. If set to 0, rule's evaluation interval will be used instead. (default 5m0s)
-datasource.queryTimeAlignment
Flag is deprecated and will be removed in next releases, please use `eval_alignment` in rule group instead.
Whether to align "time" parameter with evaluation interval.Alignment supposed to produce deterministic results despite number of vmalert replicas or time they were started. See more details here https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257 (default true)
Deprecated: please use "eval_alignment" in rule group instead. Whether to align "time" parameter with evaluation interval. Alignment supposed to produce deterministic results despite number of vmalert replicas or time they were started. See more details at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257 (default true)
-datasource.roundDigits int
Adds "round_digits" GET param to datasource requests. In VM "round_digits" limits the number of digits after the decimal point in response values.
-datasource.showURL
Whether to avoid stripping sensitive information such as auth headers or passwords from URLs in log messages or UI and exported metrics.
It is hidden by default, since it can contain sensitive info such as auth key.
Whether to avoid stripping sensitive information such as auth headers or passwords from URLs in log messages or UI and exported metrics. It is hidden by default, since it can contain sensitive info such as auth key
-datasource.tlsCAFile string
Optional path to TLS CA file to use for verifying connections to -datasource.url. By default, system CA is used
-datasource.tlsCertFile string
@ -1000,13 +1003,13 @@ The shortlist of configuration flags is the following:
-datasource.url string
Datasource compatible with Prometheus HTTP API. It can be single node VictoriaMetrics or vmselect URL. Required parameter. E.g. http://127.0.0.1:8428 . See also -remoteRead.disablePathAppend and -datasource.showURL
-defaultTenant.graphite string
Default tenant for Graphite alerting groups. See https://docs.victoriametrics.com/vmalert.html#multitenancy .This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Default tenant for Graphite alerting groups. See https://docs.victoriametrics.com/vmalert.html#multitenancy .This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-defaultTenant.prometheus string
Default tenant for Prometheus alerting groups. See https://docs.victoriametrics.com/vmalert.html#multitenancy . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Default tenant for Prometheus alerting groups. See https://docs.victoriametrics.com/vmalert.html#multitenancy . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-disableAlertgroupLabel
Whether to disable adding group's Name as label to generated alerts and time series.
-dryRun
Whether to check only config files without running vmalert. The rules file are validated. The -rule flag must be specified.
Whether to check only config files without running vmalert. The rules file are validated. The -rule flag must be specified.
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default, only IPv4 TCP and UDP are used
-envflag.enable
@ -1014,7 +1017,7 @@ The shortlist of configuration flags is the following:
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-eula
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-evaluationInterval duration
How often to evaluate the rules (default 1m0s)
-external.alert.source string
@ -1024,6 +1027,8 @@ The shortlist of configuration flags is the following:
Supports an array of values separated by comma or specified via multiple flags.
-external.url string
External URL is used as alert's source for sent alerts to the notifier. By default, hostname is used as address.
-filestream.disableFadvise
Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU
-flagsAuthKey string
Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
-fs.disableMmap
@ -1055,11 +1060,11 @@ The shortlist of configuration flags is the following:
-internStringMaxLen int
The maximum length for strings to intern. A lower limit may save memory at the cost of higher CPU usage. See https://en.wikipedia.org/wiki/String_interning . See also -internStringDisableCache and -internStringCacheExpireDuration (default 500)
-license string
enterprise license key. This flag is available only in VictoriaMetrics enterprise. Documentation - https://docs.victoriametrics.com/enterprise.html, for more information, visit https://victoriametrics.com/products/enterprise/ . To request a trial license, go to https://victoriametrics.com/products/enterprise/trial/
Lisense key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed via file specified by -licenseFile command-line flag
-license.forceOffline
enables offline license verification. License keys issued must support this feature. Contact our support team for license keys with offline check support.
Whether to enable offline verification for VictoriaMetrics Enterprise license key, which has been passed either via -license or via -licenseFile command-line flag. The issued license key must support offline verification feature. Contact info@victoriametrics.com if you need offline license verification. This flag is avilable only in Enterprise binaries
-licenseFile string
path to file with enterprise license key. This flag is available only in VictoriaMetrics enterprise. Documentation - https://docs.victoriametrics.com/enterprise.html, for more information, visit https://victoriametrics.com/products/enterprise/ . To request a trial license, go to https://victoriametrics.com/products/enterprise/trial/
Path to file with license key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed inline via -license command-line flag
-loggerDisableTimestamps
Whether to disable writing timestamps in logs
-loggerErrorsPerSecondLimit int
@ -1098,6 +1103,8 @@ The shortlist of configuration flags is the following:
-notifier.bearerTokenFile array
Optional path to bearer token file for -notifier.url
Supports an array of values separated by comma or specified via multiple flags.
-notifier.blackhole -notifier.url
Whether to blackhole alerting notifications. Enable this flag if you want vmalert to evaluate alerting rules without sending any notifications to external receivers (eg. alertmanager). -notifier.url, `-notifier.config` and `-notifier.blackhole` are mutually exclusive.
-notifier.config string
Path to configuration file for notifiers
-notifier.oauth2.clientID array
@ -1115,6 +1122,8 @@ The shortlist of configuration flags is the following:
-notifier.oauth2.tokenUrl array
Optional OAuth2 tokenURL to use for -notifier.url. If multiple args are set, then they are applied independently for the corresponding -notifier.url
Supports an array of values separated by comma or specified via multiple flags.
-notifier.showURL
Whether to avoid stripping sensitive information such as passwords from URL in log messages or UI for -notifier.url. It is hidden by default, since it can contain sensitive info such as auth key
-notifier.suppressDuplicateTargetErrors
Whether to suppress 'duplicate target' errors during discovery
-notifier.tlsCAFile array
@ -1135,11 +1144,6 @@ The shortlist of configuration flags is the following:
-notifier.url array
Prometheus Alertmanager URL, e.g. http://127.0.0.1:9093. List all Alertmanager URLs if it runs in the cluster mode to ensure high availability.
Supports an array of values separated by comma or specified via multiple flags.
-notifier.showURL bool
Whether to avoid stripping sensitive information such as passwords from URLs in log messages or UI for -notifier.url.
It is hidden by default, since it can contain sensitive info such as auth key.
-notifier.blackhole bool
Whether to blackhole alerting notifications. Enable this flag if you want vmalert to evaluate alerting rules without sending any notifications to external receivers (eg. alertmanager). `-notifier.url`, `-notifier.config` and `-notifier.blackhole` are mutually exclusive.
-pprofAuthKey string
Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides httpAuth.* settings
-promscrape.consul.waitTime duration
@ -1303,22 +1307,18 @@ The shortlist of configuration flags is the following:
Whether to validate rules expressions via MetricsQL engine (default true)
-rule.validateTemplates
Whether to validate annotation and label templates (default true)
-s2a_enable_appengine_dialer
If true, opportunistically use AppEngine-specific dialer to call S2A.
-s2a_timeout duration
Timeout enforced on the connection to the S2A service for handshake. (default 3s)
-s3.configFilePath string
Path to file with S3 configs. Configs are loaded from default location if not set.
See https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
See https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-s3.configProfile string
Profile name for S3 configs. If no set, the value of the environment variable will be loaded (AWS_PROFILE or AWS_DEFAULT_PROFILE), or if both not set, DefaultSharedConfigProfile is used. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Profile name for S3 configs. If no set, the value of the environment variable will be loaded (AWS_PROFILE or AWS_DEFAULT_PROFILE), or if both not set, DefaultSharedConfigProfile is used. This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-s3.credsFilePath string
Path to file with GCS or S3 credentials. Credentials are loaded from default locations if not set.
See https://cloud.google.com/iam/docs/creating-managing-service-account-keys and https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
See https://cloud.google.com/iam/docs/creating-managing-service-account-keys and https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-s3.customEndpoint string
Custom S3 endpoint for use with S3-compatible storages (e.g. MinIO). S3 is used if not set. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Custom S3 endpoint for use with S3-compatible storages (e.g. MinIO). S3 is used if not set. This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-s3.forcePathStyle
Prefixing endpoint with bucket name when set false, true by default. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html (default true)
Prefixing endpoint with bucket name when set false, true by default. This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html (default true)
-tls
Whether to enable TLS for incoming HTTP requests at -httpListenAddr (aka https). -tlsCertFile and -tlsKeyFile must be set if -tls is set
-tlsCertFile string
@ -1508,7 +1508,7 @@ software. Please keep simplicity as the main priority.
## How to build from sources
It is recommended using
[binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
[binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest)
* `vmalert` is located in `vmutils-*` archives there.

View file

@ -0,0 +1,131 @@
package datasource
import (
"context"
"net/http"
"sync"
"time"
)
// FakeQuerier is a mock querier that return predefined results and error message
type FakeQuerier struct {
sync.Mutex
metrics []Metric
err error
}
// SetErr sets query error message
func (fq *FakeQuerier) SetErr(err error) {
fq.Lock()
fq.err = err
fq.Unlock()
}
// Reset reset querier's error message and results
func (fq *FakeQuerier) Reset() {
fq.Lock()
fq.err = nil
fq.metrics = fq.metrics[:0]
fq.Unlock()
}
// Add appends metrics to querier result metrics
func (fq *FakeQuerier) Add(metrics ...Metric) {
fq.Lock()
fq.metrics = append(fq.metrics, metrics...)
fq.Unlock()
}
// BuildWithParams return FakeQuerier itself
func (fq *FakeQuerier) BuildWithParams(_ QuerierParams) Querier {
return fq
}
// QueryRange performs query
func (fq *FakeQuerier) QueryRange(ctx context.Context, q string, _, _ time.Time) (Result, error) {
req, _, err := fq.Query(ctx, q, time.Now())
return req, err
}
// Query returns metrics restored in querier
func (fq *FakeQuerier) Query(_ context.Context, _ string, _ time.Time) (Result, *http.Request, error) {
fq.Lock()
defer fq.Unlock()
if fq.err != nil {
return Result{}, nil, fq.err
}
cp := make([]Metric, len(fq.metrics))
copy(cp, fq.metrics)
req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)
return Result{Data: cp}, req, nil
}
// FakeQuerierWithRegistry can store different results for different query expr
type FakeQuerierWithRegistry struct {
sync.Mutex
registry map[string][]Metric
}
// Set stores query result for given key
func (fqr *FakeQuerierWithRegistry) Set(key string, metrics ...Metric) {
fqr.Lock()
if fqr.registry == nil {
fqr.registry = make(map[string][]Metric)
}
fqr.registry[key] = metrics
fqr.Unlock()
}
// Reset clean querier's results registry
func (fqr *FakeQuerierWithRegistry) Reset() {
fqr.Lock()
fqr.registry = nil
fqr.Unlock()
}
// BuildWithParams returns itself
func (fqr *FakeQuerierWithRegistry) BuildWithParams(_ QuerierParams) Querier {
return fqr
}
// QueryRange performs query
func (fqr *FakeQuerierWithRegistry) QueryRange(ctx context.Context, q string, _, _ time.Time) (Result, error) {
req, _, err := fqr.Query(ctx, q, time.Now())
return req, err
}
// Query returns metrics restored in querier registry
func (fqr *FakeQuerierWithRegistry) Query(_ context.Context, expr string, _ time.Time) (Result, *http.Request, error) {
fqr.Lock()
defer fqr.Unlock()
req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)
metrics, ok := fqr.registry[expr]
if !ok {
return Result{}, req, nil
}
cp := make([]Metric, len(metrics))
copy(cp, metrics)
return Result{Data: cp}, req, nil
}
// FakeQuerierWithDelay mock querier with given delay duration
type FakeQuerierWithDelay struct {
FakeQuerier
Delay time.Duration
}
// Query returns query result after delay duration
func (fqd *FakeQuerierWithDelay) Query(ctx context.Context, expr string, ts time.Time) (Result, *http.Request, error) {
timer := time.NewTimer(fqd.Delay)
select {
case <-ctx.Done():
case <-timer.C:
}
return fqd.FakeQuerier.Query(ctx, expr, ts)
}
// BuildWithParams returns itself
func (fqd *FakeQuerierWithDelay) BuildWithParams(_ QuerierParams) Querier {
return fqd
}

View file

@ -47,9 +47,10 @@ var (
queryStep = flag.Duration("datasource.queryStep", 5*time.Minute, "How far a value can fallback to when evaluating queries. "+
"For example, if -datasource.queryStep=15s then param \"step\" with value \"15s\" will be added to every query. "+
"If set to 0, rule's evaluation interval will be used instead.")
queryTimeAlignment = flag.Bool("datasource.queryTimeAlignment", true, "Flag is deprecated and will be removed in next releases, please use `eval_alignment` in rule group instead."+
`Whether to align "time" parameter with evaluation interval.`+
"Alignment supposed to produce deterministic results despite number of vmalert replicas or time they were started. See more details here https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257")
queryTimeAlignment = flag.Bool("datasource.queryTimeAlignment", true, `Deprecated: please use "eval_alignment" in rule group instead. `+
`Whether to align "time" parameter with evaluation interval. `+
"Alignment supposed to produce deterministic results despite number of vmalert replicas or time they were started. "+
"See more details at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257")
maxIdleConnections = flag.Int("datasource.maxIdleConnections", 100, `Defines the number of idle (keep-alive connections) to each configured datasource. Consider setting this value equal to the value: groups_total * group.concurrency. Too low a value may result in a high number of sockets in TIME_WAIT state.`)
disableKeepAlive = flag.Bool("datasource.disableKeepAlive", false, `Whether to disable long-lived connections to the datasource. `+
`If true, disables HTTP keep-alives and will only use the connection to the server for a single HTTP request.`)
@ -110,6 +111,10 @@ func Init(extraParams url.Values) (QuerierBuilder, error) {
if err != nil {
return nil, fmt.Errorf("failed to configure auth: %w", err)
}
_, err = authCfg.GetAuthHeader()
if err != nil {
return nil, fmt.Errorf("failed to set request auth header to datasource %q: %s", *addr, err)
}
return &VMStorage{
c: &http.Client{Transport: tr},

View file

@ -137,12 +137,15 @@ func NewVMStorage(baseURL string, authCfg *promauth.Config, lookBack time.Durati
// Query executes the given query and returns parsed response
func (s *VMStorage) Query(ctx context.Context, query string, ts time.Time) (Result, *http.Request, error) {
req := s.newQueryRequest(query, ts)
req, err := s.newQueryRequest(query, ts)
if err != nil {
return Result{}, nil, err
}
resp, err := s.do(ctx, req)
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
// something in the middle between client and datasource might be closing
// the connection. So we do a one more attempt in hope request will succeed.
req = s.newQueryRequest(query, ts)
req, _ = s.newQueryRequest(query, ts)
resp, err = s.do(ctx, req)
}
if err != nil {
@ -173,12 +176,15 @@ func (s *VMStorage) QueryRange(ctx context.Context, query string, start, end tim
if end.IsZero() {
return res, fmt.Errorf("end param is missing")
}
req := s.newQueryRangeRequest(query, start, end)
req, err := s.newQueryRangeRequest(query, start, end)
if err != nil {
return Result{}, err
}
resp, err := s.do(ctx, req)
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
// something in the middle between client and datasource might be closing
// the connection. So we do a one more attempt in hope request will succeed.
req = s.newQueryRangeRequest(query, start, end)
req, _ = s.newQueryRangeRequest(query, start, end)
resp, err = s.do(ctx, req)
}
if err != nil {
@ -210,14 +216,20 @@ func (s *VMStorage) do(ctx context.Context, req *http.Request) (*http.Response,
return resp, nil
}
func (s *VMStorage) newQueryRangeRequest(query string, start, end time.Time) *http.Request {
req := s.newRequest()
func (s *VMStorage) newQueryRangeRequest(query string, start, end time.Time) (*http.Request, error) {
req, err := s.newRequest()
if err != nil {
return nil, fmt.Errorf("cannot create query_range request to datasource %q: %s", s.datasourceURL, err)
}
s.setPrometheusRangeReqParams(req, query, start, end)
return req
return req, nil
}
func (s *VMStorage) newQueryRequest(query string, ts time.Time) *http.Request {
req := s.newRequest()
func (s *VMStorage) newQueryRequest(query string, ts time.Time) (*http.Request, error) {
req, err := s.newRequest()
if err != nil {
return nil, fmt.Errorf("cannot create query request to datasource %q: %s", s.datasourceURL, err)
}
switch s.dataSourceType {
case "", datasourcePrometheus:
s.setPrometheusInstantReqParams(req, query, ts)
@ -226,20 +238,23 @@ func (s *VMStorage) newQueryRequest(query string, ts time.Time) *http.Request {
default:
logger.Panicf("BUG: engine not found: %q", s.dataSourceType)
}
return req
return req, nil
}
func (s *VMStorage) newRequest() *http.Request {
func (s *VMStorage) newRequest() (*http.Request, error) {
req, err := http.NewRequest(http.MethodPost, s.datasourceURL, nil)
if err != nil {
logger.Panicf("BUG: unexpected error from http.NewRequest(%q): %s", s.datasourceURL, err)
}
req.Header.Set("Content-Type", "application/json")
if s.authCfg != nil {
s.authCfg.SetHeaders(req, true)
err = s.authCfg.SetHeaders(req, true)
if err != nil {
return nil, err
}
}
for _, h := range s.extraHeaders {
req.Header.Set(h.key, h.value)
}
return req
return req, nil
}

View file

@ -637,7 +637,10 @@ func TestRequestParams(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
req := tc.vm.newRequest()
req, err := tc.vm.newRequest()
if err != nil {
t.Fatal(err)
}
switch tc.vm.dataSourceType {
case "", datasourcePrometheus:
if tc.queryRange {
@ -732,7 +735,10 @@ func TestHeaders(t *testing.T) {
for _, tt := range testCases {
t.Run(tt.name, func(t *testing.T) {
vm := tt.vmFn()
req := vm.newQueryRequest("foo", time.Now())
req, err := vm.newQueryRequest("foo", time.Now())
if err != nil {
t.Fatal(err)
}
tt.checkFn(t, req)
})
}

View file

@ -18,6 +18,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remoteread"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
@ -66,11 +67,6 @@ absolute path to all .tpl files in root.
validateTemplates = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates")
validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
maxResolveDuration = flag.Duration("rule.maxResolveDuration", 0, "Limits the maximum duration for automatic alert expiration, "+
"which by default is 4 times evaluationInterval of the parent group.")
resendDelay = flag.Duration("rule.resendDelay", 0, "Minimum amount of time to wait before resending an alert to notifier")
ruleUpdateEntriesLimit = flag.Int("rule.updateEntriesLimit", 20, "Defines the max number of rule's state updates stored in-memory. "+
"Rule's updates are available on rule's Details page and are used for debugging purposes. The number of stored updates can be overridden per rule via update_entries_limit param.")
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier. By default, hostname is used as address.")
externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager `+
@ -82,12 +78,8 @@ absolute path to all .tpl files in root.
externalLabels = flagutil.NewArrayString("external.label", "Optional label in the form 'Name=value' to add to all generated recording rules and alerts. "+
"Pass multiple -label flags in order to add multiple label sets.")
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
remoteReadIgnoreRestoreErrors = flag.Bool("remoteRead.ignoreRestoreErrors", true, "Whether to ignore errors from remote storage when restoring alerts state on startup. DEPRECATED - this flag has no effect and will be removed in the next releases.")
disableAlertGroupLabel = flag.Bool("disableAlertgroupLabel", false, "Whether to disable adding group's Name as label to generated alerts and time series.")
dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmalert. The rules file are validated. The -rule flag must be specified.")
)
@ -229,7 +221,7 @@ func newManager(ctx context.Context) (*manager, error) {
return nil, fmt.Errorf("failed to init notifier: %w", err)
}
manager := &manager{
groups: make(map[uint64]*Group),
groups: make(map[uint64]*rule.Group),
querierBuilder: q,
notifiers: nts,
labels: labels,

View file

@ -8,11 +8,19 @@ import (
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
)
func init() {
// Disable rand sleep on group start during tests in order to speed up test execution.
// Rand sleep is needed only in prod code.
rule.SkipRandSleepOnGroupStart = true
}
func TestGetExternalURL(t *testing.T) {
expURL := "https://vicotriametrics.com/path"
u, err := getExternalURL(expURL, "", false)
@ -98,10 +106,10 @@ groups:
ctx, cancel := context.WithCancel(context.Background())
m := &manager{
querierBuilder: &fakeQuerier{},
groups: make(map[uint64]*Group),
querierBuilder: &datasource.FakeQuerier{},
groups: make(map[uint64]*rule.Group),
labels: map[string]string{},
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} },
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&notifier.FakeNotifier{}} },
rw: &remotewrite.Client{},
}

View file

@ -3,14 +3,13 @@ package main
import (
"context"
"fmt"
"net/url"
"sort"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
@ -19,7 +18,7 @@ type manager struct {
querierBuilder datasource.QuerierBuilder
notifiers func() []notifier.Notifier
rw *remotewrite.Client
rw remotewrite.RWClient
// remote read builder.
rr datasource.QuerierBuilder
@ -27,28 +26,28 @@ type manager struct {
labels map[string]string
groupsMu sync.RWMutex
groups map[uint64]*Group
groups map[uint64]*rule.Group
}
// RuleAPI generates APIRule object from alert by its ID(hash)
func (m *manager) RuleAPI(gID, rID uint64) (APIRule, error) {
// ruleAPI generates apiRule object from alert by its ID(hash)
func (m *manager) ruleAPI(gID, rID uint64) (apiRule, error) {
m.groupsMu.RLock()
defer m.groupsMu.RUnlock()
g, ok := m.groups[gID]
if !ok {
return APIRule{}, fmt.Errorf("can't find group with id %d", gID)
return apiRule{}, fmt.Errorf("can't find group with id %d", gID)
}
for _, rule := range g.Rules {
if rule.ID() == rID {
return rule.ToAPI(), nil
return ruleToAPI(rule), nil
}
}
return APIRule{}, fmt.Errorf("can't find rule with id %d in group %q", rID, g.Name)
return apiRule{}, fmt.Errorf("can't find rule with id %d in group %q", rID, g.Name)
}
// AlertAPI generates APIAlert object from alert by its ID(hash)
func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
// alertAPI generates apiAlert object from alert by its ID(hash)
func (m *manager) alertAPI(gID, aID uint64) (*apiAlert, error) {
m.groupsMu.RLock()
defer m.groupsMu.RUnlock()
@ -56,12 +55,12 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
if !ok {
return nil, fmt.Errorf("can't find group with id %d", gID)
}
for _, rule := range g.Rules {
ar, ok := rule.(*AlertingRule)
for _, r := range g.Rules {
ar, ok := r.(*rule.AlertingRule)
if !ok {
continue
}
if apiAlert := ar.AlertAPI(aID); apiAlert != nil {
if apiAlert := alertToAPI(ar, aID); apiAlert != nil {
return apiAlert, nil
}
}
@ -82,15 +81,15 @@ func (m *manager) close() {
m.wg.Wait()
}
func (m *manager) startGroup(ctx context.Context, g *Group, restore bool) error {
func (m *manager) startGroup(ctx context.Context, g *rule.Group, restore bool) error {
m.wg.Add(1)
id := g.ID()
go func() {
defer m.wg.Done()
if restore {
g.start(ctx, m.notifiers, m.rw, m.rr)
g.Start(ctx, m.notifiers, m.rw, m.rr)
} else {
g.start(ctx, m.notifiers, m.rw, nil)
g.Start(ctx, m.notifiers, m.rw, nil)
}
}()
m.groups[id] = g
@ -99,7 +98,7 @@ func (m *manager) startGroup(ctx context.Context, g *Group, restore bool) error
func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore bool) error {
var rrPresent, arPresent bool
groupsRegistry := make(map[uint64]*Group)
groupsRegistry := make(map[uint64]*rule.Group)
for _, cfg := range groupsCfg {
for _, r := range cfg.Rules {
if rrPresent && arPresent {
@ -112,7 +111,7 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
arPresent = true
}
}
ng := newGroup(cfg, m.querierBuilder, *evaluationInterval, m.labels)
ng := rule.NewGroup(cfg, m.querierBuilder, *evaluationInterval, m.labels)
groupsRegistry[ng.ID()] = ng
}
@ -124,8 +123,8 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
}
type updateItem struct {
old *Group
new *Group
old *rule.Group
new *rule.Group
}
var toUpdate []updateItem
@ -135,7 +134,7 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
if !ok {
// old group is not present in new list,
// so must be stopped and deleted
og.close()
og.Close()
delete(m.groups, og.ID())
og = nil
continue
@ -157,81 +156,13 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
var wg sync.WaitGroup
for _, item := range toUpdate {
wg.Add(1)
go func(old *Group, new *Group) {
old.updateCh <- new
go func(old *rule.Group, new *rule.Group) {
old.UpdateWith(new)
wg.Done()
}(item.old, item.new)
item.old.interruptEval()
item.old.InterruptEval()
}
wg.Wait()
}
return nil
}
func (g *Group) toAPI() APIGroup {
g.mu.RLock()
defer g.mu.RUnlock()
ag := APIGroup{
// encode as string to avoid rounding
ID: fmt.Sprintf("%d", g.ID()),
Name: g.Name,
Type: g.Type.String(),
File: g.File,
Interval: g.Interval.Seconds(),
LastEvaluation: g.LastEvaluation,
Concurrency: g.Concurrency,
Params: urlValuesToStrings(g.Params),
Headers: headersToStrings(g.Headers),
NotifierHeaders: headersToStrings(g.NotifierHeaders),
Labels: g.Labels,
}
ag.Rules = make([]APIRule, 0)
for _, r := range g.Rules {
ag.Rules = append(ag.Rules, r.ToAPI())
}
return ag
}
func urlValuesToStrings(values url.Values) []string {
if len(values) < 1 {
return nil
}
keys := make([]string, 0, len(values))
for k := range values {
keys = append(keys, k)
}
sort.Strings(keys)
var res []string
for _, k := range keys {
params := values[k]
for _, v := range params {
res = append(res, fmt.Sprintf("%s=%s", k, v))
}
}
return res
}
func headersToStrings(headers map[string]string) []string {
if len(headers) < 1 {
return nil
}
keys := make([]string, 0, len(headers))
for k := range headers {
keys = append(keys, k)
}
sort.Strings(keys)
var res []string
for _, k := range keys {
v := headers[k]
res = append(res, fmt.Sprintf("%s: %s", k, v))
}
return res
}

View file

@ -10,8 +10,10 @@ import (
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
)
@ -26,7 +28,7 @@ func TestMain(m *testing.M) {
// successful cases of
// starting with empty rules folder
func TestManagerEmptyRulesDir(t *testing.T) {
m := &manager{groups: make(map[uint64]*Group)}
m := &manager{groups: make(map[uint64]*rule.Group)}
cfg := loadCfg(t, []string{"foo/bar"}, true, true)
if err := m.update(context.Background(), cfg, false); err != nil {
t.Fatalf("expected to load successfully with empty rules dir; got err instead: %v", err)
@ -38,9 +40,9 @@ func TestManagerEmptyRulesDir(t *testing.T) {
// Should be executed with -race flag
func TestManagerUpdateConcurrent(t *testing.T) {
m := &manager{
groups: make(map[uint64]*Group),
querierBuilder: &fakeQuerier{},
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} },
groups: make(map[uint64]*rule.Group),
querierBuilder: &datasource.FakeQuerier{},
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&notifier.FakeNotifier{}} },
}
paths := []string{
"config/testdata/dir/rules0-good.rules",
@ -91,7 +93,7 @@ func TestManagerUpdate(t *testing.T) {
}()
var (
VMRows = &AlertingRule{
VMRows = &rule.AlertingRule{
Name: "VMRows",
Expr: "vm_rows > 0",
For: 10 * time.Second,
@ -104,7 +106,7 @@ func TestManagerUpdate(t *testing.T) {
"description": "{{$labels}}",
},
}
Conns = &AlertingRule{
Conns = &rule.AlertingRule{
Name: "Conns",
Expr: "sum(vm_tcplistener_conns) by(instance) > 1",
Annotations: map[string]string{
@ -112,7 +114,7 @@ func TestManagerUpdate(t *testing.T) {
"description": "It is {{ $value }} connections for {{$labels.instance}}",
},
}
ExampleAlertAlwaysFiring = &AlertingRule{
ExampleAlertAlwaysFiring = &rule.AlertingRule{
Name: "ExampleAlertAlwaysFiring",
Expr: "sum by(job) (up == 1)",
}
@ -122,20 +124,20 @@ func TestManagerUpdate(t *testing.T) {
name string
initPath string
updatePath string
want []*Group
want []*rule.Group
}{
{
name: "update good rules",
initPath: "config/testdata/rules/rules0-good.rules",
updatePath: "config/testdata/dir/rules1-good.rules",
want: []*Group{
want: []*rule.Group{
{
File: "config/testdata/dir/rules1-good.rules",
Name: "duplicatedGroupDiffFiles",
Type: config.NewPrometheusType(),
Interval: defaultEvalInterval,
Rules: []Rule{
&AlertingRule{
Rules: []rule.Rule{
&rule.AlertingRule{
Name: "VMRows",
Expr: "vm_rows > 0",
For: 5 * time.Minute,
@ -153,19 +155,20 @@ func TestManagerUpdate(t *testing.T) {
name: "update good rules from 1 to 2 groups",
initPath: "config/testdata/dir/rules/rules1-good.rules",
updatePath: "config/testdata/rules/rules0-good.rules",
want: []*Group{
want: []*rule.Group{
{
File: "config/testdata/rules/rules0-good.rules",
Name: "groupGorSingleAlert",
Type: config.NewPrometheusType(),
Rules: []Rule{VMRows},
Interval: defaultEvalInterval,
Rules: []rule.Rule{VMRows},
},
{
File: "config/testdata/rules/rules0-good.rules",
Interval: defaultEvalInterval,
Type: config.NewPrometheusType(),
Name: "TestGroup", Rules: []Rule{
Name: "TestGroup",
Rules: []rule.Rule{
Conns,
ExampleAlertAlwaysFiring,
},
@ -176,20 +179,20 @@ func TestManagerUpdate(t *testing.T) {
name: "update with one bad rule file",
initPath: "config/testdata/rules/rules0-good.rules",
updatePath: "config/testdata/dir/rules2-bad.rules",
want: []*Group{
want: []*rule.Group{
{
File: "config/testdata/rules/rules0-good.rules",
Name: "groupGorSingleAlert",
Type: config.NewPrometheusType(),
Interval: defaultEvalInterval,
Rules: []Rule{VMRows},
Rules: []rule.Rule{VMRows},
},
{
File: "config/testdata/rules/rules0-good.rules",
Interval: defaultEvalInterval,
Name: "TestGroup",
Type: config.NewPrometheusType(),
Rules: []Rule{
Rules: []rule.Rule{
Conns,
ExampleAlertAlwaysFiring,
},
@ -200,19 +203,20 @@ func TestManagerUpdate(t *testing.T) {
name: "update empty dir rules from 0 to 2 groups",
initPath: "config/testdata/empty/*",
updatePath: "config/testdata/rules/rules0-good.rules",
want: []*Group{
want: []*rule.Group{
{
File: "config/testdata/rules/rules0-good.rules",
Name: "groupGorSingleAlert",
Type: config.NewPrometheusType(),
Interval: defaultEvalInterval,
Rules: []Rule{VMRows},
Rules: []rule.Rule{VMRows},
},
{
File: "config/testdata/rules/rules0-good.rules",
Interval: defaultEvalInterval,
Type: config.NewPrometheusType(),
Name: "TestGroup", Rules: []Rule{
Name: "TestGroup",
Rules: []rule.Rule{
Conns,
ExampleAlertAlwaysFiring,
},
@ -224,9 +228,9 @@ func TestManagerUpdate(t *testing.T) {
t.Run(tc.name, func(t *testing.T) {
ctx, cancel := context.WithCancel(context.TODO())
m := &manager{
groups: make(map[uint64]*Group),
querierBuilder: &fakeQuerier{},
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} },
groups: make(map[uint64]*rule.Group),
querierBuilder: &datasource.FakeQuerier{},
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&notifier.FakeNotifier{}} },
}
cfgInit := loadCfg(t, []string{tc.initPath}, true, true)
@ -255,11 +259,36 @@ func TestManagerUpdate(t *testing.T) {
})
}
}
func compareGroups(t *testing.T, a, b *rule.Group) {
t.Helper()
if a.Name != b.Name {
t.Fatalf("expected group name %q; got %q", a.Name, b.Name)
}
if a.File != b.File {
t.Fatalf("expected group %q file name %q; got %q", a.Name, a.File, b.File)
}
if a.Interval != b.Interval {
t.Fatalf("expected group %q interval %v; got %v", a.Name, a.Interval, b.Interval)
}
if len(a.Rules) != len(b.Rules) {
t.Fatalf("expected group %s to have %d rules; got: %d",
a.Name, len(a.Rules), len(b.Rules))
}
for i, r := range a.Rules {
got, want := r, b.Rules[i]
if a.ID() != b.ID() {
t.Fatalf("expected to have rule %q; got %q", want.ID(), got.ID())
}
if err := rule.CompareRules(t, want, got); err != nil {
t.Fatalf("comparison error: %s", err)
}
}
}
func TestManagerUpdateNegative(t *testing.T) {
testCases := []struct {
notifiers []notifier.Notifier
rw *remotewrite.Client
rw remotewrite.RWClient
cfg config.Group
expErr string
}{
@ -286,7 +315,7 @@ func TestManagerUpdateNegative(t *testing.T) {
"contains alerting rules",
},
{
[]notifier.Notifier{&fakeNotifier{}},
[]notifier.Notifier{&notifier.FakeNotifier{}},
nil,
config.Group{
Name: "Recording and alerting rules",
@ -316,8 +345,8 @@ func TestManagerUpdateNegative(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.cfg.Name, func(t *testing.T) {
m := &manager{
groups: make(map[uint64]*Group),
querierBuilder: &fakeQuerier{},
groups: make(map[uint64]*rule.Group),
querierBuilder: &datasource.FakeQuerier{},
rw: tc.rw,
}
if tc.notifiers != nil {
@ -346,21 +375,3 @@ func loadCfg(t *testing.T, path []string, validateAnnotations, validateExpressio
}
return cfg
}
func TestUrlValuesToStrings(t *testing.T) {
mapQueryParams := map[string][]string{
"param1": {"param1"},
"param2": {"anotherparam"},
}
expectedRes := []string{"param1=param1", "param2=anotherparam"}
res := urlValuesToStrings(mapQueryParams)
if len(res) != len(expectedRes) {
t.Errorf("Expected length %d, but got %d", len(expectedRes), len(res))
}
for ind, val := range expectedRes {
if val != res[ind] {
t.Errorf("Expected %v; but got %v", val, res[ind])
}
}
}

View file

@ -88,7 +88,10 @@ func (am *AlertManager) send(ctx context.Context, alerts []Alert, headers map[st
req = req.WithContext(ctx)
if am.authCfg != nil {
am.authCfg.SetHeaders(req, true)
err = am.authCfg.SetHeaders(req, true)
if err != nil {
return err
}
}
resp, err := am.client.Do(req)
if err != nil {

View file

@ -0,0 +1,59 @@
package notifier
import (
"context"
"fmt"
"sync"
"time"
)
// FakeNotifier is a mock notifier
type FakeNotifier struct {
sync.Mutex
alerts []Alert
// records number of received alerts in total
counter int
}
// Close does nothing
func (*FakeNotifier) Close() {}
// Addr returns ""
func (*FakeNotifier) Addr() string { return "" }
// Send sets alerts and increases counter
func (fn *FakeNotifier) Send(_ context.Context, alerts []Alert, _ map[string]string) error {
fn.Lock()
defer fn.Unlock()
fn.counter += len(alerts)
fn.alerts = alerts
return nil
}
// GetCounter returns received alerts count
func (fn *FakeNotifier) GetCounter() int {
fn.Lock()
defer fn.Unlock()
return fn.counter
}
// GetAlerts returns stored alerts
func (fn *FakeNotifier) GetAlerts() []Alert {
fn.Lock()
defer fn.Unlock()
return fn.alerts
}
// FaultyNotifier is a mock notifier that Send() will return failed response
type FaultyNotifier struct {
FakeNotifier
}
// Send returns failed response
func (fn *FaultyNotifier) Send(ctx context.Context, _ []Alert, _ map[string]string) error {
d, ok := ctx.Deadline()
if ok {
time.Sleep(time.Until(d))
}
return fmt.Errorf("send failed")
}

View file

@ -0,0 +1,325 @@
package remotewrite
import (
"bytes"
"context"
"flag"
"fmt"
"io"
"net/http"
"path"
"strings"
"sync"
"time"
"github.com/golang/snappy"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
)
const (
defaultConcurrency = 4
defaultMaxBatchSize = 1e3
defaultMaxQueueSize = 1e5
defaultFlushInterval = 5 * time.Second
defaultWriteTimeout = 30 * time.Second
)
var (
disablePathAppend = flag.Bool("remoteWrite.disablePathAppend", false, "Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.")
sendTimeout = flag.Duration("remoteWrite.sendTimeout", 30*time.Second, "Timeout for sending data to the configured -remoteWrite.url.")
retryMinInterval = flag.Duration("remoteWrite.retryMinInterval", time.Second, "The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxInterval")
retryMaxTime = flag.Duration("remoteWrite.retryMaxTime", time.Second*30, "The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval")
)
// Client is an asynchronous HTTP client for writing
// timeseries via remote write protocol.
type Client struct {
addr string
c *http.Client
authCfg *promauth.Config
input chan prompbmarshal.TimeSeries
flushInterval time.Duration
maxBatchSize int
maxQueueSize int
wg sync.WaitGroup
doneCh chan struct{}
}
// Config is config for remote write client.
type Config struct {
// Addr of remote storage
Addr string
AuthCfg *promauth.Config
// Concurrency defines number of readers that
// concurrently read from the queue and flush data
Concurrency int
// MaxBatchSize defines max number of timeseries
// to be flushed at once
MaxBatchSize int
// MaxQueueSize defines max length of input queue
// populated by Push method.
// Push will be rejected once queue is full.
MaxQueueSize int
// FlushInterval defines time interval for flushing batches
FlushInterval time.Duration
// Transport will be used by the underlying http.Client
Transport *http.Transport
}
// NewClient returns asynchronous client for
// writing timeseries via remotewrite protocol.
func NewClient(ctx context.Context, cfg Config) (*Client, error) {
if cfg.Addr == "" {
return nil, fmt.Errorf("config.Addr can't be empty")
}
if cfg.MaxBatchSize == 0 {
cfg.MaxBatchSize = defaultMaxBatchSize
}
if cfg.MaxQueueSize == 0 {
cfg.MaxQueueSize = defaultMaxQueueSize
}
if cfg.FlushInterval == 0 {
cfg.FlushInterval = defaultFlushInterval
}
if cfg.Transport == nil {
cfg.Transport = http.DefaultTransport.(*http.Transport).Clone()
}
cc := defaultConcurrency
if cfg.Concurrency > 0 {
cc = cfg.Concurrency
}
c := &Client{
c: &http.Client{
Timeout: *sendTimeout,
Transport: cfg.Transport,
},
addr: strings.TrimSuffix(cfg.Addr, "/"),
authCfg: cfg.AuthCfg,
flushInterval: cfg.FlushInterval,
maxBatchSize: cfg.MaxBatchSize,
maxQueueSize: cfg.MaxQueueSize,
doneCh: make(chan struct{}),
input: make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
}
for i := 0; i < cc; i++ {
c.run(ctx)
}
return c, nil
}
// Push adds timeseries into queue for writing into remote storage.
// Push returns and error if client is stopped or if queue is full.
func (c *Client) Push(s prompbmarshal.TimeSeries) error {
select {
case <-c.doneCh:
return fmt.Errorf("client is closed")
case c.input <- s:
return nil
default:
return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+
"Queue size is controlled by -remoteWrite.maxQueueSize flag",
c.maxQueueSize)
}
}
// Close stops the client and waits for all goroutines
// to exit.
func (c *Client) Close() error {
if c.doneCh == nil {
return fmt.Errorf("client is already closed")
}
close(c.input)
close(c.doneCh)
c.wg.Wait()
return nil
}
func (c *Client) run(ctx context.Context) {
ticker := time.NewTicker(c.flushInterval)
wr := &prompbmarshal.WriteRequest{}
shutdown := func() {
for ts := range c.input {
wr.Timeseries = append(wr.Timeseries, ts)
}
lastCtx, cancel := context.WithTimeout(context.Background(), defaultWriteTimeout)
logger.Infof("shutting down remote write client and flushing remained %d series", len(wr.Timeseries))
c.flush(lastCtx, wr)
cancel()
}
c.wg.Add(1)
go func() {
defer c.wg.Done()
defer ticker.Stop()
for {
select {
case <-c.doneCh:
shutdown()
return
case <-ctx.Done():
shutdown()
return
case <-ticker.C:
c.flush(ctx, wr)
case ts, ok := <-c.input:
if !ok {
continue
}
wr.Timeseries = append(wr.Timeseries, ts)
if len(wr.Timeseries) >= c.maxBatchSize {
c.flush(ctx, wr)
}
}
}
}()
}
var (
sentRows = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`)
sentBytes = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`)
sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`)
droppedRows = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`)
droppedBytes = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`)
bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`)
_ = metrics.NewGauge(`vmalert_remotewrite_concurrency`, func() float64 {
return float64(*concurrency)
})
)
// flush is a blocking function that marshals WriteRequest and sends
// it to remote-write endpoint. Flush performs limited amount of retries
// if request fails.
func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
if len(wr.Timeseries) < 1 {
return
}
defer prompbmarshal.ResetWriteRequest(wr)
defer bufferFlushDuration.UpdateDuration(time.Now())
data, err := wr.Marshal()
if err != nil {
logger.Errorf("failed to marshal WriteRequest: %s", err)
return
}
b := snappy.Encode(nil, data)
retryInterval, maxRetryInterval := *retryMinInterval, *retryMaxTime
if retryInterval > maxRetryInterval {
retryInterval = maxRetryInterval
}
timeStart := time.Now()
defer func() {
sendDuration.Add(time.Since(timeStart).Seconds())
}()
L:
for attempts := 0; ; attempts++ {
err := c.send(ctx, b)
if err == nil {
sentRows.Add(len(wr.Timeseries))
sentBytes.Add(len(b))
return
}
_, isNotRetriable := err.(*nonRetriableError)
logger.Warnf("attempt %d to send request failed: %s (retriable: %v)", attempts+1, err, !isNotRetriable)
if isNotRetriable {
// exit fast if error isn't retriable
break
}
// check if request has been cancelled before backoff
select {
case <-ctx.Done():
logger.Errorf("interrupting retry attempt %d: context cancelled", attempts+1)
break L
default:
}
timeLeftForRetries := maxRetryInterval - time.Since(timeStart)
if timeLeftForRetries < 0 {
// the max retry time has passed, so we give up
break
}
if retryInterval > timeLeftForRetries {
retryInterval = timeLeftForRetries
}
// sleeping to prevent remote db hammering
time.Sleep(retryInterval)
retryInterval *= 2
}
droppedRows.Add(len(wr.Timeseries))
droppedBytes.Add(len(b))
logger.Errorf("attempts to send remote-write request failed - dropping %d time series",
len(wr.Timeseries))
}
func (c *Client) send(ctx context.Context, data []byte) error {
r := bytes.NewReader(data)
req, err := http.NewRequest(http.MethodPost, c.addr, r)
if err != nil {
return fmt.Errorf("failed to create new HTTP request: %w", err)
}
// RFC standard compliant headers
req.Header.Set("Content-Encoding", "snappy")
req.Header.Set("Content-Type", "application/x-protobuf")
// Prometheus compliant headers
req.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
if c.authCfg != nil {
err = c.authCfg.SetHeaders(req, true)
if err != nil {
return &nonRetriableError{err: err}
}
}
if !*disablePathAppend {
req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
}
resp, err := c.c.Do(req.WithContext(ctx))
if err != nil {
return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
req.URL.Redacted(), err, len(data), r.Size())
}
defer func() { _ = resp.Body.Close() }()
body, _ := io.ReadAll(resp.Body)
// according to https://prometheus.io/docs/concepts/remote_write_spec/
// Prometheus remote Write compatible receivers MUST
switch resp.StatusCode / 100 {
case 2:
// respond with a HTTP 2xx status code when the write is successful.
return nil
case 4:
if resp.StatusCode != http.StatusTooManyRequests {
// MUST NOT retry write requests on HTTP 4xx responses other than 429
return &nonRetriableError{fmt.Errorf("unexpected response code %d for %s. Response body %q",
resp.StatusCode, req.URL.Redacted(), body)}
}
fallthrough
default:
return fmt.Errorf("unexpected response code %d for %s. Response body %q",
resp.StatusCode, req.URL.Redacted(), body)
}
}
type nonRetriableError struct {
err error
}
func (e *nonRetriableError) Error() string {
return e.err.Error()
}

View file

@ -0,0 +1,97 @@
package remotewrite
import (
"bytes"
"fmt"
"io"
"net/http"
"path"
"strings"
"sync"
"github.com/golang/snappy"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// DebugClient won't push series periodically, but will write data to remote endpoint
// immediately when Push() is called
type DebugClient struct {
addr string
c *http.Client
wg sync.WaitGroup
}
// NewDebugClient initiates and returns a new DebugClient
func NewDebugClient() (*DebugClient, error) {
if *addr == "" {
return nil, nil
}
t, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
if err != nil {
return nil, fmt.Errorf("failed to create transport: %w", err)
}
c := &DebugClient{
c: &http.Client{
Timeout: *sendTimeout,
Transport: t,
},
addr: strings.TrimSuffix(*addr, "/"),
}
return c, nil
}
// Push sends the given timeseries to the remote storage.
func (c *DebugClient) Push(s prompbmarshal.TimeSeries) error {
c.wg.Add(1)
defer c.wg.Done()
wr := &prompbmarshal.WriteRequest{Timeseries: []prompbmarshal.TimeSeries{s}}
data, err := wr.Marshal()
if err != nil {
return fmt.Errorf("failed to marshal the given time series: %w", err)
}
return c.send(data)
}
// Close stops the DebugClient
func (c *DebugClient) Close() error {
c.wg.Wait()
return nil
}
func (c *DebugClient) send(data []byte) error {
b := snappy.Encode(nil, data)
r := bytes.NewReader(b)
req, err := http.NewRequest(http.MethodPost, c.addr, r)
if err != nil {
return fmt.Errorf("failed to create new HTTP request: %w", err)
}
// RFC standard compliant headers
req.Header.Set("Content-Encoding", "snappy")
req.Header.Set("Content-Type", "application/x-protobuf")
// Prometheus compliant headers
req.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
if !*disablePathAppend {
req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
}
resp, err := c.c.Do(req)
if err != nil {
return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
req.URL.Redacted(), err, len(data), r.Size())
}
defer func() { _ = resp.Body.Close() }()
if resp.StatusCode/100 == 2 {
return nil
}
body, _ := io.ReadAll(resp.Body)
return fmt.Errorf("unexpected response code %d for %s. Response body %q",
resp.StatusCode, req.URL.Redacted(), body)
}

View file

@ -0,0 +1,50 @@
package remotewrite
import (
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
func TestDebugClient_Push(t *testing.T) {
testSrv := newRWServer()
oldAddr := *addr
*addr = testSrv.URL
defer func() {
*addr = oldAddr
}()
client, err := NewDebugClient()
if err != nil {
t.Fatalf("failed to create debug client: %s", err)
}
const rowsN = 100
var sent int
for i := 0; i < rowsN; i++ {
s := prompbmarshal.TimeSeries{
Samples: []prompbmarshal.Sample{{
Value: float64(i),
Timestamp: time.Now().Unix(),
}},
}
err := client.Push(s)
if err != nil {
t.Fatalf("unexpected err: %s", err)
}
if err == nil {
sent++
}
}
if sent == 0 {
t.Fatalf("0 series sent")
}
if err := client.Close(); err != nil {
t.Fatalf("failed to close client: %s", err)
}
got := testSrv.accepted()
if got != sent {
t.Fatalf("expected to have %d series; got %d", sent, got)
}
}

View file

@ -1,322 +1,13 @@
package remotewrite
import (
"bytes"
"context"
"flag"
"fmt"
"io"
"net/http"
"path"
"strings"
"sync"
"time"
"github.com/golang/snappy"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
)
var (
disablePathAppend = flag.Bool("remoteWrite.disablePathAppend", false, "Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.")
sendTimeout = flag.Duration("remoteWrite.sendTimeout", 30*time.Second, "Timeout for sending data to the configured -remoteWrite.url.")
retryMinInterval = flag.Duration("remoteWrite.retryMinInterval", time.Second, "The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxInterval")
retryMaxTime = flag.Duration("remoteWrite.retryMaxTime", time.Second*30, "The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval")
)
// Client is an asynchronous HTTP client for writing
// timeseries via remote write protocol.
type Client struct {
addr string
c *http.Client
authCfg *promauth.Config
input chan prompbmarshal.TimeSeries
flushInterval time.Duration
maxBatchSize int
maxQueueSize int
wg sync.WaitGroup
doneCh chan struct{}
}
// Config is config for remote write.
type Config struct {
// Addr of remote storage
Addr string
AuthCfg *promauth.Config
// Concurrency defines number of readers that
// concurrently read from the queue and flush data
Concurrency int
// MaxBatchSize defines max number of timeseries
// to be flushed at once
MaxBatchSize int
// MaxQueueSize defines max length of input queue
// populated by Push method.
// Push will be rejected once queue is full.
MaxQueueSize int
// FlushInterval defines time interval for flushing batches
FlushInterval time.Duration
// Transport will be used by the underlying http.Client
Transport *http.Transport
}
const (
defaultConcurrency = 4
defaultMaxBatchSize = 1e3
defaultMaxQueueSize = 1e5
defaultFlushInterval = 5 * time.Second
defaultWriteTimeout = 30 * time.Second
)
// NewClient returns asynchronous client for
// writing timeseries via remotewrite protocol.
func NewClient(ctx context.Context, cfg Config) (*Client, error) {
if cfg.Addr == "" {
return nil, fmt.Errorf("config.Addr can't be empty")
}
if cfg.MaxBatchSize == 0 {
cfg.MaxBatchSize = defaultMaxBatchSize
}
if cfg.MaxQueueSize == 0 {
cfg.MaxQueueSize = defaultMaxQueueSize
}
if cfg.FlushInterval == 0 {
cfg.FlushInterval = defaultFlushInterval
}
if cfg.Transport == nil {
cfg.Transport = http.DefaultTransport.(*http.Transport).Clone()
}
cc := defaultConcurrency
if cfg.Concurrency > 0 {
cc = cfg.Concurrency
}
c := &Client{
c: &http.Client{
Timeout: *sendTimeout,
Transport: cfg.Transport,
},
addr: strings.TrimSuffix(cfg.Addr, "/"),
authCfg: cfg.AuthCfg,
flushInterval: cfg.FlushInterval,
maxBatchSize: cfg.MaxBatchSize,
maxQueueSize: cfg.MaxQueueSize,
doneCh: make(chan struct{}),
input: make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
}
for i := 0; i < cc; i++ {
c.run(ctx)
}
return c, nil
}
// Push adds timeseries into queue for writing into remote storage.
// Push returns and error if client is stopped or if queue is full.
func (c *Client) Push(s prompbmarshal.TimeSeries) error {
select {
case <-c.doneCh:
return fmt.Errorf("client is closed")
case c.input <- s:
return nil
default:
return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+
"Queue size is controlled by -remoteWrite.maxQueueSize flag",
c.maxQueueSize)
}
}
// Close stops the client and waits for all goroutines
// to exit.
func (c *Client) Close() error {
if c.doneCh == nil {
return fmt.Errorf("client is already closed")
}
close(c.input)
close(c.doneCh)
c.wg.Wait()
return nil
}
func (c *Client) run(ctx context.Context) {
ticker := time.NewTicker(c.flushInterval)
wr := &prompbmarshal.WriteRequest{}
shutdown := func() {
for ts := range c.input {
wr.Timeseries = append(wr.Timeseries, ts)
}
lastCtx, cancel := context.WithTimeout(context.Background(), defaultWriteTimeout)
logger.Infof("shutting down remote write client and flushing remained %d series", len(wr.Timeseries))
c.flush(lastCtx, wr)
cancel()
}
c.wg.Add(1)
go func() {
defer c.wg.Done()
defer ticker.Stop()
for {
select {
case <-c.doneCh:
shutdown()
return
case <-ctx.Done():
shutdown()
return
case <-ticker.C:
c.flush(ctx, wr)
case ts, ok := <-c.input:
if !ok {
continue
}
wr.Timeseries = append(wr.Timeseries, ts)
if len(wr.Timeseries) >= c.maxBatchSize {
c.flush(ctx, wr)
}
}
}
}()
}
var (
sentRows = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`)
sentBytes = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`)
sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`)
droppedRows = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`)
droppedBytes = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`)
bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`)
_ = metrics.NewGauge(`vmalert_remotewrite_concurrency`, func() float64 {
return float64(*concurrency)
})
)
// flush is a blocking function that marshals WriteRequest and sends
// it to remote-write endpoint. Flush performs limited amount of retries
// if request fails.
func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
if len(wr.Timeseries) < 1 {
return
}
defer prompbmarshal.ResetWriteRequest(wr)
defer bufferFlushDuration.UpdateDuration(time.Now())
data, err := wr.Marshal()
if err != nil {
logger.Errorf("failed to marshal WriteRequest: %s", err)
return
}
b := snappy.Encode(nil, data)
retryInterval, maxRetryInterval := *retryMinInterval, *retryMaxTime
if retryInterval > maxRetryInterval {
retryInterval = maxRetryInterval
}
timeStart := time.Now()
defer func() {
sendDuration.Add(time.Since(timeStart).Seconds())
}()
L:
for attempts := 0; ; attempts++ {
err := c.send(ctx, b)
if err == nil {
sentRows.Add(len(wr.Timeseries))
sentBytes.Add(len(b))
return
}
_, isNotRetriable := err.(*nonRetriableError)
logger.Warnf("attempt %d to send request failed: %s (retriable: %v)", attempts+1, err, !isNotRetriable)
if isNotRetriable {
// exit fast if error isn't retriable
break
}
// check if request has been cancelled before backoff
select {
case <-ctx.Done():
logger.Errorf("interrupting retry attempt %d: context cancelled", attempts+1)
break L
default:
}
timeLeftForRetries := maxRetryInterval - time.Since(timeStart)
if timeLeftForRetries < 0 {
// the max retry time has passed, so we give up
break
}
if retryInterval > timeLeftForRetries {
retryInterval = timeLeftForRetries
}
// sleeping to prevent remote db hammering
time.Sleep(retryInterval)
retryInterval *= 2
}
droppedRows.Add(len(wr.Timeseries))
droppedBytes.Add(len(b))
logger.Errorf("attempts to send remote-write request failed - dropping %d time series",
len(wr.Timeseries))
}
func (c *Client) send(ctx context.Context, data []byte) error {
r := bytes.NewReader(data)
req, err := http.NewRequest(http.MethodPost, c.addr, r)
if err != nil {
return fmt.Errorf("failed to create new HTTP request: %w", err)
}
// RFC standard compliant headers
req.Header.Set("Content-Encoding", "snappy")
req.Header.Set("Content-Type", "application/x-protobuf")
// Prometheus compliant headers
req.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
if c.authCfg != nil {
c.authCfg.SetHeaders(req, true)
}
if !*disablePathAppend {
req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
}
resp, err := c.c.Do(req.WithContext(ctx))
if err != nil {
return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
req.URL.Redacted(), err, len(data), r.Size())
}
defer func() { _ = resp.Body.Close() }()
body, _ := io.ReadAll(resp.Body)
// according to https://prometheus.io/docs/concepts/remote_write_spec/
// Prometheus remote Write compatible receivers MUST
switch resp.StatusCode / 100 {
case 2:
// respond with a HTTP 2xx status code when the write is successful.
return nil
case 4:
if resp.StatusCode != http.StatusTooManyRequests {
// MUST NOT retry write requests on HTTP 4xx responses other than 429
return &nonRetriableError{fmt.Errorf("unexpected response code %d for %s. Response body %q",
resp.StatusCode, req.URL.Redacted(), body)}
}
fallthrough
default:
return fmt.Errorf("unexpected response code %d for %s. Response body %q",
resp.StatusCode, req.URL.Redacted(), body)
}
}
type nonRetriableError struct {
err error
}
func (e *nonRetriableError) Error() string {
return e.err.Error()
// RWClient represents an HTTP client for pushing data via remote write protocol
type RWClient interface {
// Push pushes the give time series to remote storage
Push(s prompbmarshal.TimeSeries) error
// Close stops the client. Client can't be reused after Close call.
Close() error
}

View file

@ -1,19 +1,16 @@
package main
import (
"context"
"flag"
"fmt"
"strings"
"time"
"github.com/cheggaaa/pb/v3"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
var (
@ -33,7 +30,7 @@ var (
"Progress bar rendering might be verbose or break the logs parsing, so it is recommended to be disabled when not used in interactive mode.")
)
func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw *remotewrite.Client) error {
func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw remotewrite.RWClient) error {
if *replayMaxDatapoints < 1 {
return fmt.Errorf("replay.maxDatapointsPerQuery can't be lower than 1")
}
@ -68,8 +65,8 @@ func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw *remotewr
var total int
for _, cfg := range groupsCfg {
ng := newGroup(cfg, qb, *evaluationInterval, labels)
total += ng.replay(tFrom, tTo, rw)
ng := rule.NewGroup(cfg, qb, *evaluationInterval, labels)
total += ng.Replay(tFrom, tTo, rw, *replayMaxDatapoints, *replayRuleRetryAttempts, *replayRulesDelay, *disableProgressBar)
}
logger.Infof("replay finished! Imported %d samples", total)
if rw != nil {
@ -77,99 +74,3 @@ func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw *remotewr
}
return nil
}
func (g *Group) replay(start, end time.Time, rw *remotewrite.Client) int {
var total int
step := g.Interval * time.Duration(*replayMaxDatapoints)
start = g.adjustReqTimestamp(start)
ri := rangeIterator{start: start, end: end, step: step}
iterations := int(end.Sub(start)/step) + 1
fmt.Printf("\nGroup %q"+
"\ninterval: \t%v"+
"\neval_offset: \t%v"+
"\nrequests to make: \t%d"+
"\nmax range per request: \t%v\n",
g.Name, g.Interval, g.EvalOffset, iterations, step)
if g.Limit > 0 {
fmt.Printf("\nPlease note, `limit: %d` param has no effect during replay.\n",
g.Limit)
}
for _, rule := range g.Rules {
fmt.Printf("> Rule %q (ID: %d)\n", rule, rule.ID())
var bar *pb.ProgressBar
if !*disableProgressBar {
bar = pb.StartNew(iterations)
}
ri.reset()
for ri.next() {
n, err := replayRule(rule, ri.s, ri.e, rw)
if err != nil {
logger.Fatalf("rule %q: %s", rule, err)
}
total += n
if bar != nil {
bar.Increment()
}
}
if bar != nil {
bar.Finish()
}
// sleep to let remote storage to flush data on-disk
// so chained rules could be calculated correctly
time.Sleep(*replayRulesDelay)
}
return total
}
func replayRule(rule Rule, start, end time.Time, rw *remotewrite.Client) (int, error) {
var err error
var tss []prompbmarshal.TimeSeries
for i := 0; i < *replayRuleRetryAttempts; i++ {
tss, err = rule.ExecRange(context.Background(), start, end)
if err == nil {
break
}
logger.Errorf("attempt %d to execute rule %q failed: %s", i+1, rule, err)
time.Sleep(time.Second)
}
if err != nil { // means all attempts failed
return 0, err
}
if len(tss) < 1 {
return 0, nil
}
var n int
for _, ts := range tss {
if err := rw.Push(ts); err != nil {
return n, fmt.Errorf("remote write failure: %s", err)
}
n += len(ts.Samples)
}
return n, nil
}
type rangeIterator struct {
step time.Duration
start, end time.Time
iter int
s, e time.Time
}
func (ri *rangeIterator) reset() {
ri.iter = 0
ri.s, ri.e = time.Time{}, time.Time{}
}
func (ri *rangeIterator) next() bool {
ri.s = ri.start.Add(ri.step * time.Duration(ri.iter))
if !ri.end.After(ri.s) {
return false
}
ri.e = ri.s.Add(ri.step)
if ri.e.After(ri.end) {
ri.e = ri.end
}
ri.iter++
return true
}

View file

@ -12,7 +12,7 @@ import (
)
type fakeReplayQuerier struct {
fakeQuerier
datasource.FakeQuerier
registry map[string]map[string]struct{}
}
@ -170,81 +170,3 @@ func TestReplay(t *testing.T) {
})
}
}
func TestRangeIterator(t *testing.T) {
testCases := []struct {
ri rangeIterator
result [][2]time.Time
}{
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
step: 5 * time.Minute,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:05:00.000Z")},
{parseTime(t, "2021-01-01T12:05:00.000Z"), parseTime(t, "2021-01-01T12:10:00.000Z")},
{parseTime(t, "2021-01-01T12:10:00.000Z"), parseTime(t, "2021-01-01T12:15:00.000Z")},
{parseTime(t, "2021-01-01T12:15:00.000Z"), parseTime(t, "2021-01-01T12:20:00.000Z")},
{parseTime(t, "2021-01-01T12:20:00.000Z"), parseTime(t, "2021-01-01T12:25:00.000Z")},
{parseTime(t, "2021-01-01T12:25:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
},
},
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
step: 45 * time.Minute,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
{parseTime(t, "2021-01-01T12:30:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
},
},
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:12.000Z"),
end: parseTime(t, "2021-01-01T12:00:17.000Z"),
step: time.Second,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:12.000Z"), parseTime(t, "2021-01-01T12:00:13.000Z")},
{parseTime(t, "2021-01-01T12:00:13.000Z"), parseTime(t, "2021-01-01T12:00:14.000Z")},
{parseTime(t, "2021-01-01T12:00:14.000Z"), parseTime(t, "2021-01-01T12:00:15.000Z")},
{parseTime(t, "2021-01-01T12:00:15.000Z"), parseTime(t, "2021-01-01T12:00:16.000Z")},
{parseTime(t, "2021-01-01T12:00:16.000Z"), parseTime(t, "2021-01-01T12:00:17.000Z")},
},
},
}
for i, tc := range testCases {
t.Run(fmt.Sprintf("case %d", i), func(t *testing.T) {
var j int
for tc.ri.next() {
if len(tc.result) < j+1 {
t.Fatalf("unexpected result for iterator on step %d: %v - %v",
j, tc.ri.s, tc.ri.e)
}
s, e := tc.ri.s, tc.ri.e
expS, expE := tc.result[j][0], tc.result[j][1]
if s != expS {
t.Fatalf("expected to get start=%v; got %v", expS, s)
}
if e != expE {
t.Fatalf("expected to get end=%v; got %v", expE, e)
}
j++
}
})
}
}
func parseTime(t *testing.T, s string) time.Time {
t.Helper()
tt, err := time.Parse("2006-01-02T15:04:05.000Z", s)
if err != nil {
t.Fatal(err)
}
return tt
}

View file

@ -1,118 +0,0 @@
package main
import (
"context"
"errors"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// Rule represents alerting or recording rule
// that has unique ID, can be Executed and
// updated with other Rule.
type Rule interface {
// ID returns unique ID that may be used for
// identifying this Rule among others.
ID() uint64
// Exec executes the rule with given context at the given timestamp and limit.
// returns an err if number of resulting time series exceeds the limit.
Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error)
// ExecRange executes the rule on the given time range.
ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error)
// UpdateWith performs modification of current Rule
// with fields of the given Rule.
UpdateWith(Rule) error
// ToAPI converts Rule into APIRule
ToAPI() APIRule
// Close performs the shutdown procedures for rule
// such as metrics unregister
Close()
}
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels. See https://docs.victoriametrics.com/vmalert.html#series-with-the-same-labelset for details")
type ruleState struct {
sync.RWMutex
entries []ruleStateEntry
cur int
}
type ruleStateEntry struct {
// stores last moment of time rule.Exec was called
time time.Time
// stores the timestamp rule.Exec was called with
at time.Time
// stores the duration of the last rule.Exec call
duration time.Duration
// stores last error that happened in Exec func
// resets on every successful Exec
// may be used as Health ruleState
err error
// stores the number of samples returned during
// the last evaluation
samples int
// stores the number of time series fetched during
// the last evaluation.
// Is supported by VictoriaMetrics only, starting from v1.90.0
// If seriesFetched == nil, then this attribute was missing in
// datasource response (unsupported).
seriesFetched *int
// stores the curl command reflecting the HTTP request used during rule.Exec
curl string
}
func newRuleState(size int) *ruleState {
if size < 1 {
size = 1
}
return &ruleState{
entries: make([]ruleStateEntry, size),
}
}
func (s *ruleState) getLast() ruleStateEntry {
s.RLock()
defer s.RUnlock()
return s.entries[s.cur]
}
func (s *ruleState) size() int {
s.RLock()
defer s.RUnlock()
return len(s.entries)
}
func (s *ruleState) getAll() []ruleStateEntry {
entries := make([]ruleStateEntry, 0)
s.RLock()
defer s.RUnlock()
cur := s.cur
for {
e := s.entries[cur]
if !e.time.IsZero() || !e.at.IsZero() {
entries = append(entries, e)
}
cur--
if cur < 0 {
cur = cap(s.entries) - 1
}
if cur == s.cur {
return entries
}
}
}
func (s *ruleState) add(e ruleStateEntry) {
s.Lock()
defer s.Unlock()
s.cur++
if s.cur > cap(s.entries)-1 {
s.cur = 0
}
s.entries[s.cur] = e
}

View file

@ -1,11 +1,10 @@
package main
package rule
import (
"context"
"fmt"
"hash/fnv"
"sort"
"strconv"
"strings"
"sync"
"time"
@ -55,7 +54,8 @@ type alertingRuleMetrics struct {
seriesFetched *utils.Gauge
}
func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
// NewAlertingRule creates a new AlertingRule
func NewAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
ar := &AlertingRule{
Type: group.Type,
RuleID: cfg.ID,
@ -80,10 +80,15 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
metrics: &alertingRuleMetrics{},
}
entrySize := *ruleUpdateEntriesLimit
if cfg.UpdateEntriesLimit != nil {
ar.state = newRuleState(*cfg.UpdateEntriesLimit)
} else {
ar.state = newRuleState(*ruleUpdateEntriesLimit)
entrySize = *cfg.UpdateEntriesLimit
}
if entrySize < 1 {
entrySize = 1
}
ar.state = &ruleState{
entries: make([]StateEntry, entrySize),
}
labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID())
@ -114,7 +119,7 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
ar.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_error{%s}`, labels),
func() float64 {
e := ar.state.getLast()
if e.err == nil {
if e.Err == nil {
return 0
}
return 1
@ -122,28 +127,28 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
ar.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_samples{%s}`, labels),
func() float64 {
e := ar.state.getLast()
return float64(e.samples)
return float64(e.Samples)
})
ar.metrics.seriesFetched = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_series_fetched{%s}`, labels),
func() float64 {
e := ar.state.getLast()
if e.seriesFetched == nil {
if e.SeriesFetched == nil {
// means seriesFetched is unsupported
return -1
}
seriesFetched := float64(*e.seriesFetched)
if seriesFetched == 0 && e.samples > 0 {
seriesFetched := float64(*e.SeriesFetched)
if seriesFetched == 0 && e.Samples > 0 {
// `alert: 0.95` will fetch no series
// but will get one time series in response.
seriesFetched = float64(e.samples)
seriesFetched = float64(e.Samples)
}
return seriesFetched
})
return ar
}
// Close unregisters rule metrics
func (ar *AlertingRule) Close() {
// close unregisters rule metrics
func (ar *AlertingRule) close() {
ar.metrics.active.Unregister()
ar.metrics.pending.Unregister()
ar.metrics.errors.Unregister()
@ -162,6 +167,27 @@ func (ar *AlertingRule) ID() uint64 {
return ar.RuleID
}
// GetAlerts returns active alerts of rule
func (ar *AlertingRule) GetAlerts() []*notifier.Alert {
ar.alertsMu.RLock()
defer ar.alertsMu.RUnlock()
var alerts []*notifier.Alert
for _, a := range ar.alerts {
alerts = append(alerts, a)
}
return alerts
}
// GetAlert returns alert if id exists
func (ar *AlertingRule) GetAlert(id uint64) *notifier.Alert {
ar.alertsMu.RLock()
defer ar.alertsMu.RUnlock()
if ar.alerts == nil {
return nil
}
return ar.alerts[id]
}
func (ar *AlertingRule) logDebugf(at time.Time, a *notifier.Alert, format string, args ...interface{}) {
if !ar.Debug {
return
@ -188,6 +214,26 @@ func (ar *AlertingRule) logDebugf(at time.Time, a *notifier.Alert, format string
logger.Infof("%s", prefix+msg)
}
// updateWith copies all significant fields.
// alerts state isn't copied since
// it should be updated in next 2 Execs
func (ar *AlertingRule) updateWith(r Rule) error {
nr, ok := r.(*AlertingRule)
if !ok {
return fmt.Errorf("BUG: attempt to update alerting rule with wrong type %#v", r)
}
ar.Expr = nr.Expr
ar.For = nr.For
ar.KeepFiringFor = nr.KeepFiringFor
ar.Labels = nr.Labels
ar.Annotations = nr.Annotations
ar.EvalInterval = nr.EvalInterval
ar.Debug = nr.Debug
ar.q = nr.q
ar.state = nr.state
return nil
}
type labelSet struct {
// origin labels extracted from received time series
// plus extra labels (group labels, service labels like alertNameLabel).
@ -248,11 +294,11 @@ func (ar *AlertingRule) toLabels(m datasource.Metric, qFn templates.QueryFn) (*l
return ls, nil
}
// ExecRange executes alerting rule on the given time range similarly to Exec.
// execRange executes alerting rule on the given time range similarly to exec.
// It doesn't update internal states of the Rule and meant to be used just
// to get time series for backfilling.
// It returns ALERT and ALERT_FOR_STATE time series as result.
func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
func (ar *AlertingRule) execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
res, err := ar.q.QueryRange(ctx, ar.Expr, start, end)
if err != nil {
return nil, err
@ -297,19 +343,19 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]
// is kept in memory state and consequently repeatedly sent to the AlertManager.
const resolvedRetention = 15 * time.Minute
// Exec executes AlertingRule expression via the given Querier.
// exec executes AlertingRule expression via the given Querier.
// Based on the Querier results AlertingRule maintains notifier.Alerts
func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
start := time.Now()
res, req, err := ar.q.Query(ctx, ar.Expr, ts)
curState := ruleStateEntry{
time: start,
at: ts,
duration: time.Since(start),
samples: len(res.Data),
seriesFetched: res.SeriesFetched,
err: err,
curl: requestToCurl(req),
curState := StateEntry{
Time: start,
At: ts,
Duration: time.Since(start),
Samples: len(res.Data),
SeriesFetched: res.SeriesFetched,
Err: err,
Curl: requestToCurl(req),
}
defer func() {
@ -323,7 +369,7 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
}
ar.logDebugf(ts, nil, "query returned %d samples (elapsed: %s)", curState.samples, curState.duration)
ar.logDebugf(ts, nil, "query returned %d samples (elapsed: %s)", curState.Samples, curState.Duration)
for h, a := range ar.alerts {
// cleanup inactive alerts from previous Exec
@ -342,15 +388,15 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
for _, m := range res.Data {
ls, err := ar.toLabels(m, qFn)
if err != nil {
curState.err = fmt.Errorf("failed to expand labels: %s", err)
return nil, curState.err
curState.Err = fmt.Errorf("failed to expand labels: %s", err)
return nil, curState.Err
}
h := hash(ls.processed)
if _, ok := updated[h]; ok {
// duplicate may be caused by extra labels
// conflicting with the metric labels
curState.err = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
return nil, curState.err
curState.Err = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
return nil, curState.Err
}
updated[h] = struct{}{}
if a, ok := ar.alerts[h]; ok {
@ -373,8 +419,8 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
}
a, err := ar.newAlert(m, ls, start, qFn)
if err != nil {
curState.err = fmt.Errorf("failed to create alert: %w", err)
return nil, curState.err
curState.Err = fmt.Errorf("failed to create alert: %w", err)
return nil, curState.Err
}
a.ID = h
a.State = notifier.StatePending
@ -423,8 +469,8 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
}
if limit > 0 && numActivePending > limit {
ar.alerts = map[uint64]*notifier.Alert{}
curState.err = fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending)
return nil, curState.err
curState.Err = fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending)
return nil, curState.Err
}
return ar.toTimeSeries(ts.Unix()), nil
}
@ -441,26 +487,6 @@ func (ar *AlertingRule) toTimeSeries(timestamp int64) []prompbmarshal.TimeSeries
return tss
}
// UpdateWith copies all significant fields.
// alerts state isn't copied since
// it should be updated in next 2 Execs
func (ar *AlertingRule) UpdateWith(r Rule) error {
nr, ok := r.(*AlertingRule)
if !ok {
return fmt.Errorf("BUG: attempt to update alerting rule with wrong type %#v", r)
}
ar.Expr = nr.Expr
ar.For = nr.For
ar.KeepFiringFor = nr.KeepFiringFor
ar.Labels = nr.Labels
ar.Annotations = nr.Annotations
ar.EvalInterval = nr.EvalInterval
ar.Debug = nr.Debug
ar.q = nr.q
ar.state = nr.state
return nil
}
// TODO: consider hashing algorithm in VM
func hash(labels map[string]string) uint64 {
hash := fnv.New64a()
@ -503,102 +529,6 @@ func (ar *AlertingRule) newAlert(m datasource.Metric, ls *labelSet, start time.T
return a, err
}
// AlertAPI generates APIAlert object from alert by its id(hash)
func (ar *AlertingRule) AlertAPI(id uint64) *APIAlert {
ar.alertsMu.RLock()
defer ar.alertsMu.RUnlock()
a, ok := ar.alerts[id]
if !ok {
return nil
}
return ar.newAlertAPI(*a)
}
// ToAPI returns Rule representation in form of APIRule
// Isn't thread-safe. Call must be protected by AlertingRule mutex.
func (ar *AlertingRule) ToAPI() APIRule {
lastState := ar.state.getLast()
r := APIRule{
Type: "alerting",
DatasourceType: ar.Type.String(),
Name: ar.Name,
Query: ar.Expr,
Duration: ar.For.Seconds(),
KeepFiringFor: ar.KeepFiringFor.Seconds(),
Labels: ar.Labels,
Annotations: ar.Annotations,
LastEvaluation: lastState.time,
EvaluationTime: lastState.duration.Seconds(),
Health: "ok",
State: "inactive",
Alerts: ar.AlertsToAPI(),
LastSamples: lastState.samples,
LastSeriesFetched: lastState.seriesFetched,
MaxUpdates: ar.state.size(),
Updates: ar.state.getAll(),
Debug: ar.Debug,
// encode as strings to avoid rounding in JSON
ID: fmt.Sprintf("%d", ar.ID()),
GroupID: fmt.Sprintf("%d", ar.GroupID),
}
if lastState.err != nil {
r.LastError = lastState.err.Error()
r.Health = "err"
}
// satisfy APIRule.State logic
if len(r.Alerts) > 0 {
r.State = notifier.StatePending.String()
stateFiring := notifier.StateFiring.String()
for _, a := range r.Alerts {
if a.State == stateFiring {
r.State = stateFiring
break
}
}
}
return r
}
// AlertsToAPI generates list of APIAlert objects from existing alerts
func (ar *AlertingRule) AlertsToAPI() []*APIAlert {
var alerts []*APIAlert
ar.alertsMu.RLock()
for _, a := range ar.alerts {
if a.State == notifier.StateInactive {
continue
}
alerts = append(alerts, ar.newAlertAPI(*a))
}
ar.alertsMu.RUnlock()
return alerts
}
func (ar *AlertingRule) newAlertAPI(a notifier.Alert) *APIAlert {
aa := &APIAlert{
// encode as strings to avoid rounding
ID: fmt.Sprintf("%d", a.ID),
GroupID: fmt.Sprintf("%d", a.GroupID),
RuleID: fmt.Sprintf("%d", ar.RuleID),
Name: a.Name,
Expression: ar.Expr,
Labels: a.Labels,
Annotations: a.Annotations,
State: a.State.String(),
ActiveAt: a.ActiveAt,
Restored: a.Restored,
Value: strconv.FormatFloat(a.Value, 'f', -1, 32),
}
if alertURLGeneratorFn != nil {
aa.SourceLink = alertURLGeneratorFn(a)
}
if a.State == notifier.StateFiring && !a.KeepFiringSince.IsZero() {
aa.Stabilizing = true
}
return aa
}
const (
// alertMetricName is the metric name for synthetic alert timeseries.
alertMetricName = "ALERTS"
@ -646,10 +576,10 @@ func alertForToTimeSeries(a *notifier.Alert, timestamp int64) prompbmarshal.Time
return newTimeSeries([]float64{float64(a.ActiveAt.Unix())}, []int64{timestamp}, labels)
}
// Restore restores the value of ActiveAt field for active alerts,
// restore restores the value of ActiveAt field for active alerts,
// based on previously written time series `alertForStateMetricName`.
// Only rules with For > 0 can be restored.
func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, ts time.Time, lookback time.Duration) error {
func (ar *AlertingRule) restore(ctx context.Context, q datasource.Querier, ts time.Time, lookback time.Duration) error {
if ar.For < 1 {
return nil
}

View file

@ -1,4 +1,4 @@
package main
package rule
import (
"context"
@ -303,13 +303,13 @@ func TestAlertingRule_Exec(t *testing.T) {
fakeGroup := Group{Name: "TestRule_Exec"}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
fq := &datasource.FakeQuerier{}
tc.rule.q = fq
tc.rule.GroupID = fakeGroup.ID()
for i, step := range tc.steps {
fq.reset()
fq.add(step...)
if _, err := tc.rule.Exec(context.TODO(), time.Now(), 0); err != nil {
fq.Reset()
fq.Add(step...)
if _, err := tc.rule.exec(context.TODO(), time.Now(), 0); err != nil {
t.Fatalf("unexpected err: %s", err)
}
// artificial delay between applying steps
@ -482,11 +482,11 @@ func TestAlertingRule_ExecRange(t *testing.T) {
fakeGroup := Group{Name: "TestRule_ExecRange"}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
fq := &datasource.FakeQuerier{}
tc.rule.q = fq
tc.rule.GroupID = fakeGroup.ID()
fq.add(tc.data...)
gotTS, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now())
fq.Add(tc.data...)
gotTS, err := tc.rule.execRange(context.TODO(), time.Now(), time.Now())
if err != nil {
t.Fatalf("unexpected err: %s", err)
}
@ -518,24 +518,24 @@ func TestAlertingRule_ExecRange(t *testing.T) {
func TestGroup_Restore(t *testing.T) {
defaultTS := time.Now()
fqr := &fakeQuerierWithRegistry{}
fqr := &datasource.FakeQuerierWithRegistry{}
fn := func(rules []config.Rule, expAlerts map[uint64]*notifier.Alert) {
t.Helper()
defer fqr.reset()
defer fqr.Reset()
for _, r := range rules {
fqr.set(r.Expr, metricWithValueAndLabels(t, 0, "__name__", r.Alert))
fqr.Set(r.Expr, metricWithValueAndLabels(t, 0, "__name__", r.Alert))
}
fg := newGroup(config.Group{Name: "TestRestore", Rules: rules}, fqr, time.Second, nil)
fg := NewGroup(config.Group{Name: "TestRestore", Rules: rules}, fqr, time.Second, nil)
wg := sync.WaitGroup{}
wg.Add(1)
go func() {
nts := func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} }
fg.start(context.Background(), nts, nil, fqr)
nts := func() []notifier.Notifier { return []notifier.Notifier{&notifier.FakeNotifier{}} }
fg.Start(context.Background(), nts, nil, fqr)
wg.Done()
}()
fg.close()
fg.Close()
wg.Wait()
gotAlerts := make(map[uint64]*notifier.Alert)
@ -582,11 +582,11 @@ func TestGroup_Restore(t *testing.T) {
ActiveAt: defaultTS,
},
})
fqr.reset()
fqr.Reset()
// one active alert with state restore
ts := time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
stateMetric("foo", ts))
fn(
[]config.Rule{{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}},
@ -598,7 +598,7 @@ func TestGroup_Restore(t *testing.T) {
// two rules, two active alerts, one with state restored
ts = time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
stateMetric("foo", ts))
fn(
[]config.Rule{
@ -616,9 +616,9 @@ func TestGroup_Restore(t *testing.T) {
// two rules, two active alerts, two with state restored
ts = time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
stateMetric("foo", ts))
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
stateMetric("bar", ts))
fn(
[]config.Rule{
@ -636,7 +636,7 @@ func TestGroup_Restore(t *testing.T) {
// one active alert but wrong state restore
ts = time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertname="bar",alertgroup="TestRestore"}[3600s])`,
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertname="bar",alertgroup="TestRestore"}[3600s])`,
stateMetric("wrong alert", ts))
fn(
[]config.Rule{{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}},
@ -648,7 +648,7 @@ func TestGroup_Restore(t *testing.T) {
// one active alert with labels
ts = time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
stateMetric("foo", ts, "env", "dev"))
fn(
[]config.Rule{{Alert: "foo", Expr: "foo", Labels: map[string]string{"env": "dev"}, For: promutils.NewDuration(time.Second)}},
@ -660,7 +660,7 @@ func TestGroup_Restore(t *testing.T) {
// one active alert with restore labels missmatch
ts = time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
stateMetric("foo", ts, "env", "dev", "team", "foo"))
fn(
[]config.Rule{{Alert: "foo", Expr: "foo", Labels: map[string]string{"env": "dev"}, For: promutils.NewDuration(time.Second)}},
@ -672,30 +672,30 @@ func TestGroup_Restore(t *testing.T) {
}
func TestAlertingRule_Exec_Negative(t *testing.T) {
fq := &fakeQuerier{}
fq := &datasource.FakeQuerier{}
ar := newTestAlertingRule("test", 0)
ar.Labels = map[string]string{"job": "test"}
ar.q = fq
// successful attempt
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
_, err := ar.Exec(context.TODO(), time.Now(), 0)
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
_, err := ar.exec(context.TODO(), time.Now(), 0)
if err != nil {
t.Fatal(err)
}
// label `job` will collide with rule extra label and will make both time series equal
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
_, err = ar.Exec(context.TODO(), time.Now(), 0)
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
_, err = ar.exec(context.TODO(), time.Now(), 0)
if !errors.Is(err, errDuplicate) {
t.Fatalf("expected to have %s error; got %s", errDuplicate, err)
}
fq.reset()
fq.Reset()
expErr := "connection reset by peer"
fq.setErr(errors.New(expErr))
_, err = ar.Exec(context.TODO(), time.Now(), 0)
fq.SetErr(errors.New(expErr))
_, err = ar.exec(context.TODO(), time.Now(), 0)
if err == nil {
t.Fatalf("expected to get err; got nil")
}
@ -705,7 +705,7 @@ func TestAlertingRule_Exec_Negative(t *testing.T) {
}
func TestAlertingRuleLimit(t *testing.T) {
fq := &fakeQuerier{}
fq := &datasource.FakeQuerier{}
ar := newTestAlertingRule("test", 0)
ar.Labels = map[string]string{"job": "test"}
ar.q = fq
@ -737,15 +737,15 @@ func TestAlertingRuleLimit(t *testing.T) {
err error
timestamp = time.Now()
)
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "bar", "job"))
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "bar", "job"))
for _, testCase := range testCases {
_, err = ar.Exec(context.TODO(), timestamp, testCase.limit)
_, err = ar.exec(context.TODO(), timestamp, testCase.limit)
if err != nil && !strings.EqualFold(err.Error(), testCase.err) {
t.Fatal(err)
}
}
fq.reset()
fq.Reset()
}
func TestAlertingRule_Template(t *testing.T) {
@ -870,12 +870,12 @@ func TestAlertingRule_Template(t *testing.T) {
fakeGroup := Group{Name: "TestRule_Exec"}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
fq := &datasource.FakeQuerier{}
tc.rule.GroupID = fakeGroup.ID()
tc.rule.q = fq
tc.rule.state = newRuleState(10)
fq.add(tc.metrics...)
if _, err := tc.rule.Exec(context.TODO(), time.Now(), 0); err != nil {
tc.rule.state = &ruleState{entries: make([]StateEntry, 10)}
fq.Add(tc.metrics...)
if _, err := tc.rule.exec(context.TODO(), time.Now(), 0); err != nil {
t.Fatalf("unexpected err: %s", err)
}
for hash, expAlert := range tc.expAlerts {
@ -989,7 +989,7 @@ func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
For: waitFor,
EvalInterval: waitFor,
alerts: make(map[uint64]*notifier.Alert),
state: newRuleState(10),
state: &ruleState{entries: make([]StateEntry, 10)},
}
return &rule
}

View file

@ -1,8 +1,10 @@
package main
package rule
import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"hash/fnv"
"net/url"
@ -11,7 +13,7 @@ import (
"sync"
"time"
"github.com/VictoriaMetrics/metrics"
"github.com/cheggaaa/pb/v3"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
@ -21,6 +23,18 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
)
var (
ruleUpdateEntriesLimit = flag.Int("rule.updateEntriesLimit", 20, "Defines the max number of rule's state updates stored in-memory. "+
"Rule's updates are available on rule's Details page and are used for debugging purposes. The number of stored updates can be overridden per rule via update_entries_limit param.")
resendDelay = flag.Duration("rule.resendDelay", 0, "MiniMum amount of time to wait before resending an alert to notifier")
maxResolveDuration = flag.Duration("rule.maxResolveDuration", 0, "Limits the maxiMum duration for automatic alert expiration, "+
"which by default is 4 times evaluationInterval of the parent ")
disableAlertGroupLabel = flag.Bool("disableAlertgroupLabel", false, "Whether to disable adding group's Name as label to generated alerts and time series.")
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
)
// Group is an entity for grouping rules
@ -96,7 +110,8 @@ func mergeLabels(groupName, ruleName string, set1, set2 map[string]string) map[s
return r
}
func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval time.Duration, labels map[string]string) *Group {
// NewGroup returns a new group
func NewGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval time.Duration, labels map[string]string) *Group {
g := &Group{
Type: cfg.Type,
Name: cfg.Name,
@ -153,11 +168,11 @@ func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval ti
return g
}
func (g *Group) newRule(qb datasource.QuerierBuilder, rule config.Rule) Rule {
if rule.Alert != "" {
return newAlertingRule(qb, g, rule)
func (g *Group) newRule(qb datasource.QuerierBuilder, r config.Rule) Rule {
if r.Alert != "" {
return NewAlertingRule(qb, g, r)
}
return newRecordingRule(qb, g, rule)
return NewRecordingRule(qb, g, r)
}
// ID return unique group ID that consists of
@ -178,8 +193,8 @@ func (g *Group) ID() uint64 {
return hash.Sum64()
}
// Restore restores alerts state for group rules
func (g *Group) Restore(ctx context.Context, qb datasource.QuerierBuilder, ts time.Time, lookback time.Duration) error {
// restore restores alerts state for group rules
func (g *Group) restore(ctx context.Context, qb datasource.QuerierBuilder, ts time.Time, lookback time.Duration) error {
for _, rule := range g.Rules {
ar, ok := rule.(*AlertingRule)
if !ok {
@ -195,7 +210,7 @@ func (g *Group) Restore(ctx context.Context, qb datasource.QuerierBuilder, ts ti
Headers: g.Headers,
Debug: ar.Debug,
})
if err := ar.Restore(ctx, q, ts, lookback); err != nil {
if err := ar.restore(ctx, q, ts, lookback); err != nil {
return fmt.Errorf("error while restoring rule %q: %w", rule, err)
}
}
@ -205,7 +220,7 @@ func (g *Group) Restore(ctx context.Context, qb datasource.QuerierBuilder, ts ti
// updateWith updates existing group with
// passed group object. This function ignores group
// evaluation interval change. It supposed to be updated
// in group.start function.
// in group.Start function.
// Not thread-safe.
func (g *Group) updateWith(newGroup *Group) error {
rulesRegistry := make(map[uint64]Rule)
@ -218,11 +233,11 @@ func (g *Group) updateWith(newGroup *Group) error {
if !ok {
// old rule is not present in the new list
// so we mark it for removing
g.Rules[i].Close()
g.Rules[i].close()
g.Rules[i] = nil
continue
}
if err := or.UpdateWith(nr); err != nil {
if err := or.updateWith(nr); err != nil {
return err
}
delete(rulesRegistry, nr.ID())
@ -255,10 +270,10 @@ func (g *Group) updateWith(newGroup *Group) error {
return nil
}
// interruptEval interrupts in-flight rules evaluations
// InterruptEval interrupts in-flight rules evaluations
// within the group. It is expected that g.evalCancel
// will be repopulated after the call.
func (g *Group) interruptEval() {
func (g *Group) InterruptEval() {
g.mu.RLock()
defer g.mu.RUnlock()
@ -267,12 +282,13 @@ func (g *Group) interruptEval() {
}
}
func (g *Group) close() {
// Close stops the group and it's rules, unregisters group metrics
func (g *Group) Close() {
if g.doneCh == nil {
return
}
close(g.doneCh)
g.interruptEval()
g.InterruptEval()
<-g.finishedCh
g.metrics.iterationDuration.Unregister()
@ -280,19 +296,21 @@ func (g *Group) close() {
g.metrics.iterationMissed.Unregister()
g.metrics.iterationInterval.Unregister()
for _, rule := range g.Rules {
rule.Close()
rule.close()
}
}
var skipRandSleepOnGroupStart bool
// SkipRandSleepOnGroupStart will skip random sleep delay in group first evaluation
var SkipRandSleepOnGroupStart bool
func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *remotewrite.Client, rr datasource.QuerierBuilder) {
// Start starts group's evaluation
func (g *Group) Start(ctx context.Context, nts func() []notifier.Notifier, rw remotewrite.RWClient, rr datasource.QuerierBuilder) {
defer func() { close(g.finishedCh) }()
evalTS := time.Now()
// sleep random duration to spread group rules evaluation
// over time in order to reduce load on datasource.
if !skipRandSleepOnGroupStart {
if !SkipRandSleepOnGroupStart {
sleepBeforeStart := delayBeforeStart(evalTS, g.ID(), g.Interval, g.EvalOffset)
g.infof("will start in %v", sleepBeforeStart)
@ -310,10 +328,10 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
}
e := &executor{
rw: rw,
notifiers: nts,
Rw: rw,
Notifiers: nts,
notifierHeaders: g.NotifierHeaders,
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
PreviouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
}
g.infof("started")
@ -355,7 +373,7 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
// restore the rules state after the first evaluation
// so only active alerts can be restored.
if rr != nil {
err := g.Restore(ctx, rr, evalTS, *remoteReadLookBack)
err := g.restore(ctx, rr, evalTS, *remoteReadLookBack)
if err != nil {
logger.Errorf("error while restoring ruleState for group %q: %s", g.Name, err)
}
@ -409,6 +427,22 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
}
}
// UpdateWith inserts new group to updateCh
func (g *Group) UpdateWith(new *Group) {
g.updateCh <- new
}
// DeepCopy returns a deep copy of group
func (g *Group) DeepCopy() *Group {
g.mu.RLock()
data, _ := json.Marshal(g)
g.mu.RUnlock()
newG := Group{}
_ = json.Unmarshal(data, &newG)
newG.Rules = g.Rules
return &newG
}
// delayBeforeStart returns a duration on the interval between [ts..ts+interval].
// delayBeforeStart accounts for `offset`, so returned duration should be always
// bigger than the `offset`.
@ -438,6 +472,89 @@ func (g *Group) infof(format string, args ...interface{}) {
g.Name, msg, g.Interval, g.EvalOffset, g.Concurrency)
}
// Replay performs group replay
func (g *Group) Replay(start, end time.Time, rw remotewrite.RWClient, maxDataPoint, replayRuleRetryAttempts int, replayDelay time.Duration, disableProgressBar bool) int {
var total int
step := g.Interval * time.Duration(maxDataPoint)
ri := rangeIterator{start: start, end: end, step: step}
iterations := int(end.Sub(start)/step) + 1
fmt.Printf("\nGroup %q"+
"\ninterval: \t%v"+
"\nrequests to make: \t%d"+
"\nmax range per request: \t%v\n",
g.Name, g.Interval, iterations, step)
if g.Limit > 0 {
fmt.Printf("\nPlease note, `limit: %d` param has no effect during replay.\n",
g.Limit)
}
for _, rule := range g.Rules {
fmt.Printf("> Rule %q (ID: %d)\n", rule, rule.ID())
var bar *pb.ProgressBar
if !disableProgressBar {
bar = pb.StartNew(iterations)
}
ri.reset()
for ri.next() {
n, err := replayRule(rule, ri.s, ri.e, rw, replayRuleRetryAttempts)
if err != nil {
logger.Fatalf("rule %q: %s", rule, err)
}
total += n
if bar != nil {
bar.Increment()
}
}
if bar != nil {
bar.Finish()
}
// sleep to let remote storage to flush data on-disk
// so chained rules could be calculated correctly
time.Sleep(replayDelay)
}
return total
}
// ExecOnce evaluates all the rules under group for once with given timestamp.
func (g *Group) ExecOnce(ctx context.Context, nts func() []notifier.Notifier, rw remotewrite.RWClient, evalTS time.Time) chan error {
e := &executor{
Rw: rw,
Notifiers: nts,
notifierHeaders: g.NotifierHeaders,
PreviouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
}
if len(g.Rules) < 1 {
return nil
}
resolveDuration := getResolveDuration(g.Interval, *resendDelay, *maxResolveDuration)
return e.execConcurrently(ctx, g.Rules, evalTS, g.Concurrency, resolveDuration, g.Limit)
}
type rangeIterator struct {
step time.Duration
start, end time.Time
iter int
s, e time.Time
}
func (ri *rangeIterator) reset() {
ri.iter = 0
ri.s, ri.e = time.Time{}, time.Time{}
}
func (ri *rangeIterator) next() bool {
ri.s = ri.start.Add(ri.step * time.Duration(ri.iter))
if !ri.end.After(ri.s) {
return false
}
ri.e = ri.s.Add(ri.step)
if ri.e.After(ri.end) {
ri.e = ri.end
}
ri.iter++
return true
}
// getResolveDuration returns the duration after which firing alert
// can be considered as resolved.
func getResolveDuration(groupInterval, delta, maxDuration time.Duration) time.Duration {
@ -477,20 +594,22 @@ func (g *Group) adjustReqTimestamp(timestamp time.Time) time.Time {
return timestamp
}
// executor contains group's notify and rw configs
type executor struct {
notifiers func() []notifier.Notifier
Notifiers func() []notifier.Notifier
notifierHeaders map[string]string
rw *remotewrite.Client
Rw remotewrite.RWClient
previouslySentSeriesToRWMu sync.Mutex
// previouslySentSeriesToRW stores series sent to RW on previous iteration
// PreviouslySentSeriesToRW stores series sent to RW on previous iteration
// map[ruleID]map[ruleLabels][]prompb.Label
// where `ruleID` is ID of the Rule within a Group
// and `ruleLabels` is []prompb.Label marshalled to a string
previouslySentSeriesToRW map[uint64]map[string][]prompbmarshal.Label
PreviouslySentSeriesToRW map[uint64]map[string][]prompbmarshal.Label
}
// execConcurrently executes rules concurrently if concurrency>1
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, ts time.Time, concurrency int, resolveDuration time.Duration, limit int) chan error {
res := make(chan error, len(rules))
if concurrency == 1 {
@ -505,14 +624,14 @@ func (e *executor) execConcurrently(ctx context.Context, rules []Rule, ts time.T
sem := make(chan struct{}, concurrency)
go func() {
wg := sync.WaitGroup{}
for _, rule := range rules {
for _, r := range rules {
sem <- struct{}{}
wg.Add(1)
go func(r Rule) {
res <- e.exec(ctx, r, ts, resolveDuration, limit)
<-sem
wg.Done()
}(rule)
}(r)
}
wg.Wait()
close(res)
@ -530,10 +649,10 @@ var (
remoteWriteTotal = metrics.NewCounter(`vmalert_remotewrite_total`)
)
func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDuration time.Duration, limit int) error {
func (e *executor) exec(ctx context.Context, r Rule, ts time.Time, resolveDuration time.Duration, limit int) error {
execTotal.Inc()
tss, err := rule.Exec(ctx, ts, limit)
tss, err := r.exec(ctx, ts, limit)
if err != nil {
if errors.Is(err, context.Canceled) {
// the context can be cancelled on graceful shutdown
@ -541,17 +660,17 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
return nil
}
execErrors.Inc()
return fmt.Errorf("rule %q: failed to execute: %w", rule, err)
return fmt.Errorf("rule %q: failed to execute: %w", r, err)
}
if e.rw != nil {
if e.Rw != nil {
pushToRW := func(tss []prompbmarshal.TimeSeries) error {
var lastErr error
for _, ts := range tss {
remoteWriteTotal.Inc()
if err := e.rw.Push(ts); err != nil {
if err := e.Rw.Push(ts); err != nil {
remoteWriteErrors.Inc()
lastErr = fmt.Errorf("rule %q: remote write failure: %w", rule, err)
lastErr = fmt.Errorf("rule %q: remote write failure: %w", r, err)
}
}
return lastErr
@ -560,13 +679,13 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
return err
}
staleSeries := e.getStaleSeries(rule, tss, ts)
staleSeries := e.getStaleSeries(r, tss, ts)
if err := pushToRW(staleSeries); err != nil {
return err
}
}
ar, ok := rule.(*AlertingRule)
ar, ok := r.(*AlertingRule)
if !ok {
return nil
}
@ -578,11 +697,11 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
wg := sync.WaitGroup{}
errGr := new(utils.ErrGroup)
for _, nt := range e.notifiers() {
for _, nt := range e.Notifiers() {
wg.Add(1)
go func(nt notifier.Notifier) {
if err := nt.Send(ctx, alerts, e.notifierHeaders); err != nil {
errGr.Add(fmt.Errorf("rule %q: failed to send alerts to addr %q: %w", rule, nt.Addr(), err))
errGr.Add(fmt.Errorf("rule %q: failed to send alerts to addr %q: %w", r, nt.Addr(), err))
}
wg.Done()
}(nt)
@ -592,7 +711,7 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
}
// getStaledSeries checks whether there are stale series from previously sent ones.
func (e *executor) getStaleSeries(rule Rule, tss []prompbmarshal.TimeSeries, timestamp time.Time) []prompbmarshal.TimeSeries {
func (e *executor) getStaleSeries(r Rule, tss []prompbmarshal.TimeSeries, timestamp time.Time) []prompbmarshal.TimeSeries {
ruleLabels := make(map[string][]prompbmarshal.Label, len(tss))
for _, ts := range tss {
// convert labels to strings so we can compare with previously sent series
@ -600,11 +719,11 @@ func (e *executor) getStaleSeries(rule Rule, tss []prompbmarshal.TimeSeries, tim
ruleLabels[key] = ts.Labels
}
rID := rule.ID()
rID := r.ID()
var staleS []prompbmarshal.TimeSeries
// check whether there are series which disappeared and need to be marked as stale
e.previouslySentSeriesToRWMu.Lock()
for key, labels := range e.previouslySentSeriesToRW[rID] {
for key, labels := range e.PreviouslySentSeriesToRW[rID] {
if _, ok := ruleLabels[key]; ok {
continue
}
@ -613,7 +732,7 @@ func (e *executor) getStaleSeries(rule Rule, tss []prompbmarshal.TimeSeries, tim
staleS = append(staleS, ss)
}
// set previous series to current
e.previouslySentSeriesToRW[rID] = ruleLabels
e.PreviouslySentSeriesToRW[rID] = ruleLabels
e.previouslySentSeriesToRWMu.Unlock()
return staleS
@ -631,14 +750,14 @@ func (e *executor) purgeStaleSeries(activeRules []Rule) {
for _, rule := range activeRules {
id := rule.ID()
prev, ok := e.previouslySentSeriesToRW[id]
prev, ok := e.PreviouslySentSeriesToRW[id]
if ok {
// keep previous series for staleness detection
newPreviouslySentSeriesToRW[id] = prev
}
}
e.previouslySentSeriesToRW = nil
e.previouslySentSeriesToRW = newPreviouslySentSeriesToRW
e.PreviouslySentSeriesToRW = nil
e.PreviouslySentSeriesToRW = newPreviouslySentSeriesToRW
e.previouslySentSeriesToRWMu.Unlock()
}

View file

@ -1,17 +1,22 @@
package main
package rule
import (
"context"
"fmt"
"math"
"os"
"reflect"
"sort"
"testing"
"time"
"gopkg.in/yaml.v2"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
@ -20,7 +25,15 @@ import (
func init() {
// Disable rand sleep on group start during tests in order to speed up test execution.
// Rand sleep is needed only in prod code.
skipRandSleepOnGroupStart = true
SkipRandSleepOnGroupStart = true
}
func TestMain(m *testing.M) {
if err := templates.Load([]string{}, true); err != nil {
fmt.Println("failed to load template for test")
os.Exit(1)
}
os.Exit(m.Run())
}
func TestUpdateWith(t *testing.T) {
@ -138,7 +151,7 @@ func TestUpdateWith(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
g := &Group{Name: "test"}
qb := &fakeQuerier{}
qb := &datasource.FakeQuerier{}
for _, r := range tc.currentRules {
r.ID = config.HashRule(r)
g.Rules = append(g.Rules, g.newRule(qb, r))
@ -170,7 +183,7 @@ func TestUpdateWith(t *testing.T) {
if got.ID() != want.ID() {
t.Fatalf("expected to have rule %q; got %q", want, got)
}
if err := compareRules(t, got, want); err != nil {
if err := CompareRules(t, got, want); err != nil {
t.Fatalf("comparison error: %s", err)
}
}
@ -179,17 +192,31 @@ func TestUpdateWith(t *testing.T) {
}
func TestGroupStart(t *testing.T) {
// TODO: make parsing from string instead of file
groups, err := config.Parse([]string{"config/testdata/rules/rules1-good.rules"}, notifier.ValidateTemplates, true)
const (
rules = `
- name: groupTest
rules:
- alert: VMRows
for: 1ms
expr: vm_rows > 0
labels:
label: bar
host: "{{ $labels.instance }}"
annotations:
summary: "{{ $value }}"
`
)
var groups []config.Group
err := yaml.Unmarshal([]byte(rules), &groups)
if err != nil {
t.Fatalf("failed to parse rules: %s", err)
}
fs := &fakeQuerier{}
fn := &fakeNotifier{}
fs := &datasource.FakeQuerier{}
fn := &notifier.FakeNotifier{}
const evalInterval = time.Millisecond
g := newGroup(groups[0], fs, evalInterval, map[string]string{"cluster": "east-1"})
g := NewGroup(groups[0], fs, evalInterval, map[string]string{"cluster": "east-1"})
g.Concurrency = 2
const inst1, inst2, job = "foo", "bar", "baz"
@ -204,7 +231,7 @@ func TestGroupStart(t *testing.T) {
alert1.State = notifier.StateFiring
// add external label
alert1.Labels["cluster"] = "east-1"
// add rule labels - see config/testdata/rules1-good.rules
// add rule labels
alert1.Labels["label"] = "bar"
alert1.Labels["host"] = inst1
// add service labels
@ -219,7 +246,7 @@ func TestGroupStart(t *testing.T) {
alert2.State = notifier.StateFiring
// add external label
alert2.Labels["cluster"] = "east-1"
// add rule labels - see config/testdata/rules1-good.rules
// add rule labels
alert2.Labels["label"] = "bar"
alert2.Labels["host"] = inst2
// add service labels
@ -228,40 +255,40 @@ func TestGroupStart(t *testing.T) {
alert2.ID = hash(alert2.Labels)
finished := make(chan struct{})
fs.add(m1)
fs.add(m2)
fs.Add(m1)
fs.Add(m2)
go func() {
g.start(context.Background(), func() []notifier.Notifier { return []notifier.Notifier{fn} }, nil, fs)
g.Start(context.Background(), func() []notifier.Notifier { return []notifier.Notifier{fn} }, nil, fs)
close(finished)
}()
// wait for multiple evals
time.Sleep(20 * evalInterval)
gotAlerts := fn.getAlerts()
gotAlerts := fn.GetAlerts()
expectedAlerts := []notifier.Alert{*alert1, *alert2}
compareAlerts(t, expectedAlerts, gotAlerts)
gotAlertsNum := fn.getCounter()
gotAlertsNum := fn.GetCounter()
if gotAlertsNum < len(expectedAlerts)*2 {
t.Fatalf("expected to receive at least %d alerts; got %d instead",
len(expectedAlerts)*2, gotAlertsNum)
}
// reset previous data
fs.reset()
fs.Reset()
// and set only one datapoint for response
fs.add(m1)
fs.Add(m1)
// wait for multiple evals
time.Sleep(20 * evalInterval)
gotAlerts = fn.getAlerts()
gotAlerts = fn.GetAlerts()
alert2.State = notifier.StateInactive
expectedAlerts = []notifier.Alert{*alert1, *alert2}
compareAlerts(t, expectedAlerts, gotAlerts)
g.close()
g.Close()
<-finished
}
@ -294,15 +321,15 @@ func TestResolveDuration(t *testing.T) {
func TestGetStaleSeries(t *testing.T) {
ts := time.Now()
e := &executor{
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
PreviouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
}
f := func(rule Rule, labels, expLabels [][]prompbmarshal.Label) {
f := func(r Rule, labels, expLabels [][]prompbmarshal.Label) {
t.Helper()
var tss []prompbmarshal.TimeSeries
for _, l := range labels {
tss = append(tss, newTimeSeriesPB([]float64{1}, []int64{ts.Unix()}, l))
}
staleS := e.getStaleSeries(rule, tss, ts)
staleS := e.getStaleSeries(r, tss, ts)
if staleS == nil && expLabels == nil {
return
}
@ -387,7 +414,7 @@ func TestPurgeStaleSeries(t *testing.T) {
f := func(curRules, newRules, expStaleRules []Rule) {
t.Helper()
e := &executor{
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
PreviouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
}
// seed executor with series for
// current rules
@ -397,13 +424,13 @@ func TestPurgeStaleSeries(t *testing.T) {
e.purgeStaleSeries(newRules)
if len(e.previouslySentSeriesToRW) != len(expStaleRules) {
if len(e.PreviouslySentSeriesToRW) != len(expStaleRules) {
t.Fatalf("expected to get %d stale series, got %d",
len(expStaleRules), len(e.previouslySentSeriesToRW))
len(expStaleRules), len(e.PreviouslySentSeriesToRW))
}
for _, exp := range expStaleRules {
if _, ok := e.previouslySentSeriesToRW[exp.ID()]; !ok {
if _, ok := e.PreviouslySentSeriesToRW[exp.ID()]; !ok {
t.Fatalf("expected to have rule %d; got nil instead", exp.ID())
}
}
@ -438,17 +465,17 @@ func TestPurgeStaleSeries(t *testing.T) {
}
func TestFaultyNotifier(t *testing.T) {
fq := &fakeQuerier{}
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
fq := &datasource.FakeQuerier{}
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
r := newTestAlertingRule("instant", 0)
r.q = fq
fn := &fakeNotifier{}
fn := &notifier.FakeNotifier{}
e := &executor{
notifiers: func() []notifier.Notifier {
Notifiers: func() []notifier.Notifier {
return []notifier.Notifier{
&faultyNotifier{},
&notifier.FaultyNotifier{},
fn,
}
},
@ -464,7 +491,7 @@ func TestFaultyNotifier(t *testing.T) {
tn := time.Now()
deadline := tn.Add(delay / 2)
for {
if fn.getCounter() > 0 {
if fn.GetCounter() > 0 {
return
}
if tn.After(deadline) {
@ -477,18 +504,18 @@ func TestFaultyNotifier(t *testing.T) {
}
func TestFaultyRW(t *testing.T) {
fq := &fakeQuerier{}
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
fq := &datasource.FakeQuerier{}
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
r := &RecordingRule{
Name: "test",
state: newRuleState(10),
q: fq,
state: &ruleState{entries: make([]StateEntry, 10)},
}
e := &executor{
rw: &remotewrite.Client{},
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
Rw: &remotewrite.Client{},
PreviouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
}
err := e.exec(context.Background(), r, time.Now(), 0, 10)
@ -498,23 +525,38 @@ func TestFaultyRW(t *testing.T) {
}
func TestCloseWithEvalInterruption(t *testing.T) {
groups, err := config.Parse([]string{"config/testdata/rules/rules1-good.rules"}, notifier.ValidateTemplates, true)
const (
rules = `
- name: groupTest
rules:
- alert: VMRows
for: 1ms
expr: vm_rows > 0
labels:
label: bar
host: "{{ $labels.instance }}"
annotations:
summary: "{{ $value }}"
`
)
var groups []config.Group
err := yaml.Unmarshal([]byte(rules), &groups)
if err != nil {
t.Fatalf("failed to parse rules: %s", err)
}
const delay = time.Second * 2
fq := &fakeQuerierWithDelay{delay: delay}
fq := &datasource.FakeQuerierWithDelay{Delay: delay}
const evalInterval = time.Millisecond
g := newGroup(groups[0], fq, evalInterval, nil)
g := NewGroup(groups[0], fq, evalInterval, nil)
go g.start(context.Background(), nil, nil, nil)
go g.Start(context.Background(), nil, nil, nil)
time.Sleep(evalInterval * 20)
go func() {
g.close()
g.Close()
}()
deadline := time.Tick(delay / 2)
@ -637,3 +679,81 @@ func TestGetPrometheusReqTimestamp(t *testing.T) {
}
}
}
func TestRangeIterator(t *testing.T) {
testCases := []struct {
ri rangeIterator
result [][2]time.Time
}{
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
step: 5 * time.Minute,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:05:00.000Z")},
{parseTime(t, "2021-01-01T12:05:00.000Z"), parseTime(t, "2021-01-01T12:10:00.000Z")},
{parseTime(t, "2021-01-01T12:10:00.000Z"), parseTime(t, "2021-01-01T12:15:00.000Z")},
{parseTime(t, "2021-01-01T12:15:00.000Z"), parseTime(t, "2021-01-01T12:20:00.000Z")},
{parseTime(t, "2021-01-01T12:20:00.000Z"), parseTime(t, "2021-01-01T12:25:00.000Z")},
{parseTime(t, "2021-01-01T12:25:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
},
},
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
step: 45 * time.Minute,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
{parseTime(t, "2021-01-01T12:30:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
},
},
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:12.000Z"),
end: parseTime(t, "2021-01-01T12:00:17.000Z"),
step: time.Second,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:12.000Z"), parseTime(t, "2021-01-01T12:00:13.000Z")},
{parseTime(t, "2021-01-01T12:00:13.000Z"), parseTime(t, "2021-01-01T12:00:14.000Z")},
{parseTime(t, "2021-01-01T12:00:14.000Z"), parseTime(t, "2021-01-01T12:00:15.000Z")},
{parseTime(t, "2021-01-01T12:00:15.000Z"), parseTime(t, "2021-01-01T12:00:16.000Z")},
{parseTime(t, "2021-01-01T12:00:16.000Z"), parseTime(t, "2021-01-01T12:00:17.000Z")},
},
},
}
for i, tc := range testCases {
t.Run(fmt.Sprintf("case %d", i), func(t *testing.T) {
var j int
for tc.ri.next() {
if len(tc.result) < j+1 {
t.Fatalf("unexpected result for iterator on step %d: %v - %v",
j, tc.ri.s, tc.ri.e)
}
s, e := tc.ri.s, tc.ri.e
expS, expE := tc.result[j][0], tc.result[j][1]
if s != expS {
t.Fatalf("expected to get start=%v; got %v", expS, s)
}
if e != expE {
t.Fatalf("expected to get end=%v; got %v", expE, e)
}
j++
}
})
}
}
func parseTime(t *testing.T, s string) time.Time {
t.Helper()
tt, err := time.Parse("2006-01-02T15:04:05.000Z", s)
if err != nil {
t.Fatal(err)
}
return tt
}

View file

@ -1,4 +1,4 @@
package main
package rule
import (
"context"
@ -49,7 +49,8 @@ func (rr *RecordingRule) ID() uint64 {
return rr.RuleID
}
func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *RecordingRule {
// NewRecordingRule creates a new RecordingRule
func NewRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *RecordingRule {
rr := &RecordingRule{
Type: group.Type,
RuleID: cfg.ID,
@ -66,17 +67,22 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
}),
}
entrySize := *ruleUpdateEntriesLimit
if cfg.UpdateEntriesLimit != nil {
rr.state = newRuleState(*cfg.UpdateEntriesLimit)
} else {
rr.state = newRuleState(*ruleUpdateEntriesLimit)
entrySize = *cfg.UpdateEntriesLimit
}
if entrySize < 1 {
entrySize = 1
}
rr.state = &ruleState{
entries: make([]StateEntry, entrySize),
}
labels := fmt.Sprintf(`recording=%q, group=%q, id="%d"`, rr.Name, group.Name, rr.ID())
rr.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels),
func() float64 {
e := rr.state.getLast()
if e.err == nil {
if e.Err == nil {
return 0
}
return 1
@ -84,21 +90,21 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
rr.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_last_evaluation_samples{%s}`, labels),
func() float64 {
e := rr.state.getLast()
return float64(e.samples)
return float64(e.Samples)
})
return rr
}
// Close unregisters rule metrics
func (rr *RecordingRule) Close() {
// close unregisters rule metrics
func (rr *RecordingRule) close() {
rr.metrics.errors.Unregister()
rr.metrics.samples.Unregister()
}
// ExecRange executes recording rule on the given time range similarly to Exec.
// execRange executes recording rule on the given time range similarly to Exec.
// It doesn't update internal states of the Rule and meant to be used just
// to get time series for backfilling.
func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
func (rr *RecordingRule) execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
res, err := rr.q.QueryRange(ctx, rr.Expr, start, end)
if err != nil {
return nil, err
@ -117,17 +123,17 @@ func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([
return tss, nil
}
// Exec executes RecordingRule expression via the given Querier.
func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
// exec executes RecordingRule expression via the given Querier.
func (rr *RecordingRule) exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
start := time.Now()
res, req, err := rr.q.Query(ctx, rr.Expr, ts)
curState := ruleStateEntry{
time: start,
at: ts,
duration: time.Since(start),
samples: len(res.Data),
seriesFetched: res.SeriesFetched,
curl: requestToCurl(req),
curState := StateEntry{
Time: start,
At: ts,
Duration: time.Since(start),
Samples: len(res.Data),
SeriesFetched: res.SeriesFetched,
Curl: requestToCurl(req),
}
defer func() {
@ -135,15 +141,15 @@ func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]p
}()
if err != nil {
curState.err = fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
return nil, curState.err
curState.Err = fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
return nil, curState.Err
}
qMetrics := res.Data
numSeries := len(qMetrics)
if limit > 0 && numSeries > limit {
curState.err = fmt.Errorf("exec exceeded limit of %d with %d series", limit, numSeries)
return nil, curState.err
curState.Err = fmt.Errorf("exec exceeded limit of %d with %d series", limit, numSeries)
return nil, curState.Err
}
duplicates := make(map[string]struct{}, len(qMetrics))
@ -152,8 +158,8 @@ func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]p
ts := rr.toTimeSeries(r)
key := stringifyLabels(ts)
if _, ok := duplicates[key]; ok {
curState.err = fmt.Errorf("original metric %v; resulting labels %q: %w", r, key, errDuplicate)
return nil, curState.err
curState.Err = fmt.Errorf("original metric %v; resulting labels %q: %w", r, key, errDuplicate)
return nil, curState.Err
}
duplicates[key] = struct{}{}
tss = append(tss, ts)
@ -193,8 +199,8 @@ func (rr *RecordingRule) toTimeSeries(m datasource.Metric) prompbmarshal.TimeSer
return newTimeSeries(m.Values, m.Timestamps, labels)
}
// UpdateWith copies all significant fields.
func (rr *RecordingRule) UpdateWith(r Rule) error {
// updateWith copies all significant fields.
func (rr *RecordingRule) updateWith(r Rule) error {
nr, ok := r.(*RecordingRule)
if !ok {
return fmt.Errorf("BUG: attempt to update recroding rule with wrong type %#v", r)
@ -204,32 +210,3 @@ func (rr *RecordingRule) UpdateWith(r Rule) error {
rr.q = nr.q
return nil
}
// ToAPI returns Rule's representation in form
// of APIRule
func (rr *RecordingRule) ToAPI() APIRule {
lastState := rr.state.getLast()
r := APIRule{
Type: "recording",
DatasourceType: rr.Type.String(),
Name: rr.Name,
Query: rr.Expr,
Labels: rr.Labels,
LastEvaluation: lastState.time,
EvaluationTime: lastState.duration.Seconds(),
Health: "ok",
LastSamples: lastState.samples,
LastSeriesFetched: lastState.seriesFetched,
MaxUpdates: rr.state.size(),
Updates: rr.state.getAll(),
// encode as strings to avoid rounding
ID: fmt.Sprintf("%d", rr.ID()),
GroupID: fmt.Sprintf("%d", rr.GroupID),
}
if lastState.err != nil {
r.LastError = lastState.err.Error()
r.Health = "err"
}
return r
}

View file

@ -1,4 +1,4 @@
package main
package rule
import (
"context"
@ -56,10 +56,12 @@ func TestRecordingRule_Exec(t *testing.T) {
Name: "job:foo",
Labels: map[string]string{
"source": "test",
}},
},
},
[]datasource.Metric{
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar"),
},
[]prompbmarshal.TimeSeries{
newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": "job:foo",
@ -76,11 +78,11 @@ func TestRecordingRule_Exec(t *testing.T) {
}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
fq.add(tc.metrics...)
fq := &datasource.FakeQuerier{}
fq.Add(tc.metrics...)
tc.rule.q = fq
tc.rule.state = newRuleState(10)
tss, err := tc.rule.Exec(context.TODO(), time.Now(), 0)
tc.rule.state = &ruleState{entries: make([]StateEntry, 10)}
tss, err := tc.rule.exec(context.TODO(), time.Now(), 0)
if err != nil {
t.Fatalf("unexpected Exec err: %s", err)
}
@ -141,7 +143,8 @@ func TestRecordingRule_ExecRange(t *testing.T) {
}},
[]datasource.Metric{
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar"),
},
[]prompbmarshal.TimeSeries{
newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": "job:foo",
@ -158,10 +161,10 @@ func TestRecordingRule_ExecRange(t *testing.T) {
}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
fq.add(tc.metrics...)
fq := &datasource.FakeQuerier{}
fq.Add(tc.metrics...)
tc.rule.q = fq
tss, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now())
tss, err := tc.rule.execRange(context.TODO(), time.Now(), time.Now())
if err != nil {
t.Fatalf("unexpected Exec err: %s", err)
}
@ -198,15 +201,15 @@ func TestRecordingRuleLimit(t *testing.T) {
metricWithValuesAndLabels(t, []float64{2, 3}, "__name__", "bar", "job", "bar"),
metricWithValuesAndLabels(t, []float64{4, 5, 6}, "__name__", "baz", "job", "baz"),
}
rule := &RecordingRule{Name: "job:foo", state: newRuleState(10), Labels: map[string]string{
rule := &RecordingRule{Name: "job:foo", state: &ruleState{entries: make([]StateEntry, 10)}, Labels: map[string]string{
"source": "test_limit",
}}
var err error
for _, testCase := range testCases {
fq := &fakeQuerier{}
fq.add(testMetrics...)
fq := &datasource.FakeQuerier{}
fq.Add(testMetrics...)
rule.q = fq
_, err = rule.Exec(context.TODO(), timestamp, testCase.limit)
_, err = rule.exec(context.TODO(), timestamp, testCase.limit)
if err != nil && !strings.EqualFold(err.Error(), testCase.err) {
t.Fatal(err)
}
@ -215,18 +218,17 @@ func TestRecordingRuleLimit(t *testing.T) {
func TestRecordingRule_ExecNegative(t *testing.T) {
rr := &RecordingRule{
Name: "job:foo",
state: newRuleState(10),
Name: "job:foo",
Labels: map[string]string{
"job": "test",
},
state: &ruleState{entries: make([]StateEntry, 10)},
}
fq := &fakeQuerier{}
fq := &datasource.FakeQuerier{}
expErr := "connection reset by peer"
fq.setErr(errors.New(expErr))
fq.SetErr(errors.New(expErr))
rr.q = fq
_, err := rr.Exec(context.TODO(), time.Now(), 0)
_, err := rr.exec(context.TODO(), time.Now(), 0)
if err == nil {
t.Fatalf("expected to get err; got nil")
}
@ -234,14 +236,14 @@ func TestRecordingRule_ExecNegative(t *testing.T) {
t.Fatalf("expected to get err %q; got %q insterad", expErr, err)
}
fq.reset()
fq.Reset()
// add metrics which differs only by `job` label
// which will be overridden by rule
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"))
fq.add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"))
fq.Add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
_, err = rr.Exec(context.TODO(), time.Now(), 0)
_, err = rr.exec(context.TODO(), time.Now(), 0)
if err == nil {
t.Fatalf("expected to get err; got nil")
}

174
app/vmalert/rule/rule.go Normal file
View file

@ -0,0 +1,174 @@
package rule
import (
"context"
"errors"
"fmt"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// Rule represents alerting or recording rule
// that has unique ID, can be Executed and
// updated with other Rule.
type Rule interface {
// ID returns unique ID that may be used for
// identifying this Rule among others.
ID() uint64
// exec executes the rule with given context at the given timestamp and limit.
// returns an err if number of resulting time series exceeds the limit.
exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error)
// execRange executes the rule on the given time range.
execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error)
// updateWith performs modification of current Rule
// with fields of the given Rule.
updateWith(Rule) error
// close performs the shutdown procedures for rule
// such as metrics unregister
close()
}
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels. See https://docs.victoriametrics.com/vmalert.html#series-with-the-same-labelset for details")
type ruleState struct {
sync.RWMutex
entries []StateEntry
cur int
}
// StateEntry stores rule's execution states
type StateEntry struct {
// stores last moment of time rule.Exec was called
Time time.Time
// stores the timesteamp with which rule.Exec was called
At time.Time
// stores the duration of the last rule.Exec call
Duration time.Duration
// stores last error that happened in Exec func
// resets on every successful Exec
// may be used as Health ruleState
Err error
// stores the number of samples returned during
// the last evaluation
Samples int
// stores the number of time series fetched during
// the last evaluation.
// Is supported by VictoriaMetrics only, starting from v1.90.0
// If seriesFetched == nil, then this attribute was missing in
// datasource response (unsupported).
SeriesFetched *int
// stores the curl command reflecting the HTTP request used during rule.Exec
Curl string
}
// GetLastEntry returns latest stateEntry of rule
func GetLastEntry(r Rule) StateEntry {
if rule, ok := r.(*AlertingRule); ok {
return rule.state.getLast()
}
if rule, ok := r.(*RecordingRule); ok {
return rule.state.getLast()
}
return StateEntry{}
}
// GetRuleStateSize returns size of rule stateEntry
func GetRuleStateSize(r Rule) int {
if rule, ok := r.(*AlertingRule); ok {
return rule.state.size()
}
if rule, ok := r.(*RecordingRule); ok {
return rule.state.size()
}
return 0
}
// GetAllRuleState returns rule entire stateEntries
func GetAllRuleState(r Rule) []StateEntry {
if rule, ok := r.(*AlertingRule); ok {
return rule.state.getAll()
}
if rule, ok := r.(*RecordingRule); ok {
return rule.state.getAll()
}
return []StateEntry{}
}
func (s *ruleState) size() int {
s.RLock()
defer s.RUnlock()
return len(s.entries)
}
func (s *ruleState) getLast() StateEntry {
s.RLock()
defer s.RUnlock()
if len(s.entries) == 0 {
return StateEntry{}
}
return s.entries[s.cur]
}
func (s *ruleState) getAll() []StateEntry {
entries := make([]StateEntry, 0)
s.RLock()
defer s.RUnlock()
cur := s.cur
for {
e := s.entries[cur]
if !e.Time.IsZero() || !e.At.IsZero() {
entries = append(entries, e)
}
cur--
if cur < 0 {
cur = cap(s.entries) - 1
}
if cur == s.cur {
return entries
}
}
}
func (s *ruleState) add(e StateEntry) {
s.Lock()
defer s.Unlock()
s.cur++
if s.cur > cap(s.entries)-1 {
s.cur = 0
}
s.entries[s.cur] = e
}
func replayRule(r Rule, start, end time.Time, rw remotewrite.RWClient, replayRuleRetryAttempts int) (int, error) {
var err error
var tss []prompbmarshal.TimeSeries
for i := 0; i < replayRuleRetryAttempts; i++ {
tss, err = r.execRange(context.Background(), start, end)
if err == nil {
break
}
logger.Errorf("attempt %d to execute rule %q failed: %s", i+1, r, err)
time.Sleep(time.Second)
}
if err != nil { // means all attempts failed
return 0, err
}
if len(tss) < 1 {
return 0, nil
}
var n int
for _, ts := range tss {
if err := rw.Push(ts); err != nil {
return n, fmt.Errorf("remote write failure: %s", err)
}
n += len(ts.Samples)
}
return n, nil
}

View file

@ -0,0 +1,81 @@
package rule
import (
"sync"
"testing"
"time"
)
func TestRule_state(t *testing.T) {
stateEntriesN := 20
r := &AlertingRule{state: &ruleState{entries: make([]StateEntry, stateEntriesN)}}
e := r.state.getLast()
if !e.At.IsZero() {
t.Fatalf("expected entry to be zero")
}
now := time.Now()
r.state.add(StateEntry{At: now})
e = r.state.getLast()
if e.At != now {
t.Fatalf("expected entry at %v to be equal to %v",
e.At, now)
}
time.Sleep(time.Millisecond)
now2 := time.Now()
r.state.add(StateEntry{At: now2})
e = r.state.getLast()
if e.At != now2 {
t.Fatalf("expected entry at %v to be equal to %v",
e.At, now2)
}
if len(r.state.getAll()) != 2 {
t.Fatalf("expected for state to have 2 entries only; got %d",
len(r.state.getAll()),
)
}
var last time.Time
for i := 0; i < stateEntriesN*2; i++ {
last = time.Now()
r.state.add(StateEntry{At: last})
}
e = r.state.getLast()
if e.At != last {
t.Fatalf("expected entry at %v to be equal to %v",
e.At, last)
}
if len(r.state.getAll()) != stateEntriesN {
t.Fatalf("expected for state to have %d entries only; got %d",
stateEntriesN, len(r.state.getAll()),
)
}
}
// TestRule_stateConcurrent supposed to test concurrent
// execution of state updates.
// Should be executed with -race flag
func TestRule_stateConcurrent(_ *testing.T) {
r := &AlertingRule{state: &ruleState{entries: make([]StateEntry, 20)}}
const workers = 50
const iterations = 100
wg := sync.WaitGroup{}
wg.Add(workers)
for i := 0; i < workers; i++ {
go func() {
defer wg.Done()
for i := 0; i < iterations; i++ {
r.state.add(StateEntry{At: time.Now()})
r.state.getAll()
r.state.getLast()
}
}()
}
wg.Wait()
}

View file

@ -1,239 +1,18 @@
package main
package rule
import (
"context"
"fmt"
"net/http"
"reflect"
"sort"
"sync"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
type fakeQuerier struct {
sync.Mutex
metrics []datasource.Metric
err error
}
func (fq *fakeQuerier) setErr(err error) {
fq.Lock()
fq.err = err
fq.Unlock()
}
func (fq *fakeQuerier) reset() {
fq.Lock()
fq.err = nil
fq.metrics = fq.metrics[:0]
fq.Unlock()
}
func (fq *fakeQuerier) add(metrics ...datasource.Metric) {
fq.Lock()
fq.metrics = append(fq.metrics, metrics...)
fq.Unlock()
}
func (fq *fakeQuerier) BuildWithParams(_ datasource.QuerierParams) datasource.Querier {
return fq
}
func (fq *fakeQuerier) QueryRange(ctx context.Context, q string, _, _ time.Time) (datasource.Result, error) {
req, _, err := fq.Query(ctx, q, time.Now())
return req, err
}
func (fq *fakeQuerier) Query(_ context.Context, _ string, _ time.Time) (datasource.Result, *http.Request, error) {
fq.Lock()
defer fq.Unlock()
if fq.err != nil {
return datasource.Result{}, nil, fq.err
}
cp := make([]datasource.Metric, len(fq.metrics))
copy(cp, fq.metrics)
req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)
return datasource.Result{Data: cp}, req, nil
}
type fakeQuerierWithRegistry struct {
sync.Mutex
registry map[string][]datasource.Metric
}
func (fqr *fakeQuerierWithRegistry) set(key string, metrics ...datasource.Metric) {
fqr.Lock()
if fqr.registry == nil {
fqr.registry = make(map[string][]datasource.Metric)
}
fqr.registry[key] = metrics
fqr.Unlock()
}
func (fqr *fakeQuerierWithRegistry) reset() {
fqr.Lock()
fqr.registry = nil
fqr.Unlock()
}
func (fqr *fakeQuerierWithRegistry) BuildWithParams(_ datasource.QuerierParams) datasource.Querier {
return fqr
}
func (fqr *fakeQuerierWithRegistry) QueryRange(ctx context.Context, q string, _, _ time.Time) (datasource.Result, error) {
req, _, err := fqr.Query(ctx, q, time.Now())
return req, err
}
func (fqr *fakeQuerierWithRegistry) Query(_ context.Context, expr string, _ time.Time) (datasource.Result, *http.Request, error) {
fqr.Lock()
defer fqr.Unlock()
req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)
metrics, ok := fqr.registry[expr]
if !ok {
return datasource.Result{}, req, nil
}
cp := make([]datasource.Metric, len(metrics))
copy(cp, metrics)
return datasource.Result{Data: cp}, req, nil
}
type fakeQuerierWithDelay struct {
fakeQuerier
delay time.Duration
}
func (fqd *fakeQuerierWithDelay) Query(ctx context.Context, expr string, ts time.Time) (datasource.Result, *http.Request, error) {
timer := time.NewTimer(fqd.delay)
select {
case <-ctx.Done():
case <-timer.C:
}
return fqd.fakeQuerier.Query(ctx, expr, ts)
}
func (fqd *fakeQuerierWithDelay) BuildWithParams(_ datasource.QuerierParams) datasource.Querier {
return fqd
}
type fakeNotifier struct {
sync.Mutex
alerts []notifier.Alert
// records number of received alerts in total
counter int
}
func (*fakeNotifier) Close() {}
func (*fakeNotifier) Addr() string { return "" }
func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert, _ map[string]string) error {
fn.Lock()
defer fn.Unlock()
fn.counter += len(alerts)
fn.alerts = alerts
return nil
}
func (fn *fakeNotifier) getCounter() int {
fn.Lock()
defer fn.Unlock()
return fn.counter
}
func (fn *fakeNotifier) getAlerts() []notifier.Alert {
fn.Lock()
defer fn.Unlock()
return fn.alerts
}
type faultyNotifier struct {
fakeNotifier
}
func (fn *faultyNotifier) Send(ctx context.Context, _ []notifier.Alert, _ map[string]string) error {
d, ok := ctx.Deadline()
if ok {
time.Sleep(time.Until(d))
}
return fmt.Errorf("send failed")
}
func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
return metricWithValuesAndLabels(t, []float64{value}, labels...)
}
func metricWithValuesAndLabels(t *testing.T, values []float64, labels ...string) datasource.Metric {
t.Helper()
m := metricWithLabels(t, labels...)
m.Values = values
for i := range values {
m.Timestamps = append(m.Timestamps, int64(i))
}
return m
}
func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
t.Helper()
if len(labels) == 0 || len(labels)%2 != 0 {
t.Fatalf("expected to get even number of labels")
}
m := datasource.Metric{Values: []float64{1}, Timestamps: []int64{1}}
for i := 0; i < len(labels); i += 2 {
m.Labels = append(m.Labels, datasource.Label{
Name: labels[i],
Value: labels[i+1],
})
}
return m
}
func toPromLabels(t *testing.T, labels ...string) []prompbmarshal.Label {
t.Helper()
if len(labels) == 0 || len(labels)%2 != 0 {
t.Fatalf("expected to get even number of labels")
}
var ls []prompbmarshal.Label
for i := 0; i < len(labels); i += 2 {
ls = append(ls, prompbmarshal.Label{
Name: labels[i],
Value: labels[i+1],
})
}
return ls
}
func compareGroups(t *testing.T, a, b *Group) {
t.Helper()
if a.Name != b.Name {
t.Fatalf("expected group name %q; got %q", a.Name, b.Name)
}
if a.File != b.File {
t.Fatalf("expected group %q file name %q; got %q", a.Name, a.File, b.File)
}
if a.Interval != b.Interval {
t.Fatalf("expected group %q interval %v; got %v", a.Name, a.Interval, b.Interval)
}
if len(a.Rules) != len(b.Rules) {
t.Fatalf("expected group %s to have %d rules; got: %d",
a.Name, len(a.Rules), len(b.Rules))
}
for i, r := range a.Rules {
got, want := r, b.Rules[i]
if a.ID() != b.ID() {
t.Fatalf("expected to have rule %q; got %q", want.ID(), got.ID())
}
if err := compareRules(t, want, got); err != nil {
t.Fatalf("comparison error: %s", err)
}
}
}
func compareRules(t *testing.T, a, b Rule) error {
// CompareRules is a test helper func for other tests
func CompareRules(t *testing.T, a, b Rule) error {
t.Helper()
switch v := a.(type) {
case *AlertingRule:
@ -287,6 +66,50 @@ func compareAlertingRules(t *testing.T, a, b *AlertingRule) error {
return nil
}
func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
return metricWithValuesAndLabels(t, []float64{value}, labels...)
}
func metricWithValuesAndLabels(t *testing.T, values []float64, labels ...string) datasource.Metric {
t.Helper()
m := metricWithLabels(t, labels...)
m.Values = values
for i := range values {
m.Timestamps = append(m.Timestamps, int64(i))
}
return m
}
func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
t.Helper()
if len(labels) == 0 || len(labels)%2 != 0 {
t.Fatalf("expected to get even number of labels")
}
m := datasource.Metric{Values: []float64{1}, Timestamps: []int64{1}}
for i := 0; i < len(labels); i += 2 {
m.Labels = append(m.Labels, datasource.Label{
Name: labels[i],
Value: labels[i+1],
})
}
return m
}
func toPromLabels(t *testing.T, labels ...string) []prompbmarshal.Label {
t.Helper()
if len(labels) == 0 || len(labels)%2 != 0 {
t.Fatalf("expected to get even number of labels")
}
var ls []prompbmarshal.Label
for i := 0; i < len(labels); i += 2 {
ls = append(ls, prompbmarshal.Label{
Name: labels[i],
Value: labels[i+1],
})
}
return ls
}
func compareTimeSeries(t *testing.T, a, b []prompbmarshal.TimeSeries) error {
t.Helper()
if len(a) != len(b) {

View file

@ -1,4 +1,4 @@
package main
package rule
import (
"fmt"

View file

@ -1,4 +1,4 @@
package main
package rule
import (
"net/http"

View file

@ -1,100 +0,0 @@
package main
import (
"sync"
"testing"
"time"
)
func TestRule_stateDisabled(t *testing.T) {
state := newRuleState(-1)
e := state.getLast()
if !e.at.IsZero() {
t.Fatalf("expected entry to be zero")
}
state.add(ruleStateEntry{at: time.Now()})
state.add(ruleStateEntry{at: time.Now()})
state.add(ruleStateEntry{at: time.Now()})
if len(state.getAll()) != 1 {
// state should store at least one update at any circumstances
t.Fatalf("expected for state to have %d entries; got %d",
1, len(state.getAll()),
)
}
}
func TestRule_state(t *testing.T) {
stateEntriesN := 20
state := newRuleState(stateEntriesN)
e := state.getLast()
if !e.at.IsZero() {
t.Fatalf("expected entry to be zero")
}
now := time.Now()
state.add(ruleStateEntry{at: now})
e = state.getLast()
if e.at != now {
t.Fatalf("expected entry at %v to be equal to %v",
e.at, now)
}
time.Sleep(time.Millisecond)
now2 := time.Now()
state.add(ruleStateEntry{at: now2})
e = state.getLast()
if e.at != now2 {
t.Fatalf("expected entry at %v to be equal to %v",
e.at, now2)
}
if len(state.getAll()) != 2 {
t.Fatalf("expected for state to have 2 entries only; got %d",
len(state.getAll()),
)
}
var last time.Time
for i := 0; i < stateEntriesN*2; i++ {
last = time.Now()
state.add(ruleStateEntry{at: last})
}
e = state.getLast()
if e.at != last {
t.Fatalf("expected entry at %v to be equal to %v",
e.at, last)
}
if len(state.getAll()) != stateEntriesN {
t.Fatalf("expected for state to have %d entries only; got %d",
stateEntriesN, len(state.getAll()),
)
}
}
// TestRule_stateConcurrent supposed to test concurrent
// execution of state updates.
// Should be executed with -race flag
func TestRule_stateConcurrent(_ *testing.T) {
state := newRuleState(20)
const workers = 50
const iterations = 100
wg := sync.WaitGroup{}
wg.Add(workers)
for i := 0; i < workers; i++ {
go func() {
defer wg.Done()
for i := 0; i < iterations; i++ {
state.add(ruleStateEntry{at: time.Now()})
state.getAll()
state.getLast()
}
}()
}
wg.Wait()
}

View file

@ -10,6 +10,7 @@ import (
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/tpl"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
@ -143,38 +144,32 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
}
}
const (
paramGroupID = "group_id"
paramAlertID = "alert_id"
paramRuleID = "rule_id"
)
func (rh *requestHandler) getRule(r *http.Request) (APIRule, error) {
groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 0)
func (rh *requestHandler) getRule(r *http.Request) (apiRule, error) {
groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 64)
if err != nil {
return APIRule{}, fmt.Errorf("failed to read %q param: %s", paramGroupID, err)
return apiRule{}, fmt.Errorf("failed to read %q param: %s", paramGroupID, err)
}
ruleID, err := strconv.ParseUint(r.FormValue(paramRuleID), 10, 0)
ruleID, err := strconv.ParseUint(r.FormValue(paramRuleID), 10, 64)
if err != nil {
return APIRule{}, fmt.Errorf("failed to read %q param: %s", paramRuleID, err)
return apiRule{}, fmt.Errorf("failed to read %q param: %s", paramRuleID, err)
}
rule, err := rh.m.RuleAPI(groupID, ruleID)
obj, err := rh.m.ruleAPI(groupID, ruleID)
if err != nil {
return APIRule{}, errResponse(err, http.StatusNotFound)
return apiRule{}, errResponse(err, http.StatusNotFound)
}
return rule, nil
return obj, nil
}
func (rh *requestHandler) getAlert(r *http.Request) (*APIAlert, error) {
groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 0)
func (rh *requestHandler) getAlert(r *http.Request) (*apiAlert, error) {
groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 64)
if err != nil {
return nil, fmt.Errorf("failed to read %q param: %s", paramGroupID, err)
}
alertID, err := strconv.ParseUint(r.FormValue(paramAlertID), 10, 0)
alertID, err := strconv.ParseUint(r.FormValue(paramAlertID), 10, 64)
if err != nil {
return nil, fmt.Errorf("failed to read %q param: %s", paramAlertID, err)
}
a, err := rh.m.AlertAPI(groupID, alertID)
a, err := rh.m.alertAPI(groupID, alertID)
if err != nil {
return nil, errResponse(err, http.StatusNotFound)
}
@ -184,17 +179,17 @@ func (rh *requestHandler) getAlert(r *http.Request) (*APIAlert, error) {
type listGroupsResponse struct {
Status string `json:"status"`
Data struct {
Groups []APIGroup `json:"groups"`
Groups []apiGroup `json:"groups"`
} `json:"data"`
}
func (rh *requestHandler) groups() []APIGroup {
func (rh *requestHandler) groups() []apiGroup {
rh.m.groupsMu.RLock()
defer rh.m.groupsMu.RUnlock()
groups := make([]APIGroup, 0)
groups := make([]apiGroup, 0)
for _, g := range rh.m.groups {
groups = append(groups, g.toAPI())
groups = append(groups, groupToAPI(g))
}
// sort list of alerts for deterministic output
@ -221,35 +216,35 @@ func (rh *requestHandler) listGroups() ([]byte, error) {
type listAlertsResponse struct {
Status string `json:"status"`
Data struct {
Alerts []*APIAlert `json:"alerts"`
Alerts []*apiAlert `json:"alerts"`
} `json:"data"`
}
func (rh *requestHandler) groupAlerts() []GroupAlerts {
func (rh *requestHandler) groupAlerts() []groupAlerts {
rh.m.groupsMu.RLock()
defer rh.m.groupsMu.RUnlock()
var groupAlerts []GroupAlerts
var gAlerts []groupAlerts
for _, g := range rh.m.groups {
var alerts []*APIAlert
var alerts []*apiAlert
for _, r := range g.Rules {
a, ok := r.(*AlertingRule)
a, ok := r.(*rule.AlertingRule)
if !ok {
continue
}
alerts = append(alerts, a.AlertsToAPI()...)
alerts = append(alerts, ruleToAPIAlert(a)...)
}
if len(alerts) > 0 {
groupAlerts = append(groupAlerts, GroupAlerts{
Group: g.toAPI(),
gAlerts = append(gAlerts, groupAlerts{
Group: groupToAPI(g),
Alerts: alerts,
})
}
}
sort.Slice(groupAlerts, func(i, j int) bool {
return groupAlerts[i].Group.Name < groupAlerts[j].Group.Name
sort.Slice(gAlerts, func(i, j int) bool {
return gAlerts[i].Group.Name < gAlerts[j].Group.Name
})
return groupAlerts
return gAlerts
}
func (rh *requestHandler) listAlerts() ([]byte, error) {
@ -257,14 +252,14 @@ func (rh *requestHandler) listAlerts() ([]byte, error) {
defer rh.m.groupsMu.RUnlock()
lr := listAlertsResponse{Status: "success"}
lr.Data.Alerts = make([]*APIAlert, 0)
lr.Data.Alerts = make([]*apiAlert, 0)
for _, g := range rh.m.groups {
for _, r := range g.Rules {
a, ok := r.(*AlertingRule)
a, ok := r.(*rule.AlertingRule)
if !ok {
continue
}
lr.Data.Alerts = append(lr.Data.Alerts, a.AlertsToAPI()...)
lr.Data.Alerts = append(lr.Data.Alerts, ruleToAPIAlert(a)...)
}
}

View file

@ -38,7 +38,7 @@ btn-primary
{% endif %}
{% endfunc %}
{% func ListGroups(r *http.Request, originGroups []APIGroup) %}
{% func ListGroups(r *http.Request, originGroups []apiGroup) %}
{%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "Groups", getLastConfigError()) %}
{%code
@ -46,9 +46,9 @@ btn-primary
rOk := make(map[string]int)
rNotOk := make(map[string]int)
rNoMatch := make(map[string]int)
var groups []APIGroup
var groups []apiGroup
for _, g := range originGroups {
var rules []APIRule
var rules []apiRule
for _, r := range g.Rules {
if r.LastError != "" {
rNotOk[g.ID]++
@ -166,7 +166,7 @@ btn-primary
{% endfunc %}
{% func ListAlerts(r *http.Request, groupAlerts []GroupAlerts) %}
{% func ListAlerts(r *http.Request, groupAlerts []groupAlerts) %}
{%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "Alerts", getLastConfigError()) %}
{% if len(groupAlerts) > 0 %}
@ -183,7 +183,7 @@ btn-primary
</div>
{%code
var keys []string
alertsByRule := make(map[string][]*APIAlert)
alertsByRule := make(map[string][]*apiAlert)
for _, alert := range ga.Alerts {
if len(alertsByRule[alert.RuleID]) < 1 {
keys = append(keys, alert.RuleID)
@ -310,7 +310,7 @@ btn-primary
{% endfunc %}
{% func Alert(r *http.Request, alert *APIAlert) %}
{% func Alert(r *http.Request, alert *apiAlert) %}
{%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "", getLastConfigError()) %}
{%code
@ -397,7 +397,7 @@ btn-primary
{% endfunc %}
{% func RuleDetails(r *http.Request, rule APIRule) %}
{% func RuleDetails(r *http.Request, rule apiRule) %}
{%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "", getLastConfigError()) %}
{%code
@ -416,9 +416,9 @@ btn-primary
var seriesFetchedEnabled bool
var seriesFetchedWarning bool
for _, u := range rule.Updates {
if u.seriesFetched != nil {
if u.SeriesFetched != nil {
seriesFetchedEnabled = true
if *u.seriesFetched == 0 && u.samples == 0{
if *u.SeriesFetched == 0 && u.Samples == 0{
seriesFetchedWarning = true
}
}
@ -537,23 +537,23 @@ btn-primary
<tbody>
{% for _, u := range rule.Updates %}
<tr{% if u.err != nil %} class="alert-danger"{% endif %}>
<tr{% if u.Err != nil %} class="alert-danger"{% endif %}>
<td>
<span class="badge bg-primary rounded-pill me-3" title="Updated at">{%s u.time.Format(time.RFC3339) %}</span>
<span class="badge bg-primary rounded-pill me-3" title="Updated at">{%s u.Time.Format(time.RFC3339) %}</span>
</td>
<td class="text-center">{%d u.samples %}</td>
{% if seriesFetchedEnabled %}<td class="text-center">{% if u.seriesFetched != nil %}{%d *u.seriesFetched %}{% endif %}</td>{% endif %}
<td class="text-center">{%f.3 u.duration.Seconds() %}s</td>
<td class="text-center">{%s u.at.Format(time.RFC3339) %}</td>
<td class="text-center">{%d u.Samples %}</td>
{% if seriesFetchedEnabled %}<td class="text-center">{% if u.SeriesFetched != nil %}{%d *u.SeriesFetched %}{% endif %}</td>{% endif %}
<td class="text-center">{%f.3 u.Duration.Seconds() %}s</td>
<td class="text-center">{%s u.At.Format(time.RFC3339) %}</td>
<td>
<textarea class="curl-area" rows="1" onclick="this.focus();this.select()">{%s u.curl %}</textarea>
<textarea class="curl-area" rows="1" onclick="this.focus();this.select()">{%s u.Curl %}</textarea>
</td>
</tr>
</li>
{% if u.err != nil %}
<tr{% if u.err != nil %} class="alert-danger"{% endif %}>
{% if u.Err != nil %}
<tr{% if u.Err != nil %} class="alert-danger"{% endif %}>
<td colspan="{% if seriesFetchedEnabled %}6{%else%}5{%endif%}">
<span class="alert-danger">{%v u.err %}</span>
<span class="alert-danger">{%v u.Err %}</span>
</td>
</tr>
{% endif %}
@ -582,7 +582,7 @@ btn-primary
<span class="badge bg-warning text-dark" title="This firing state is kept because of `keep_firing_for`">stabilizing</span>
{% endfunc %}
{% func seriesFetchedWarn(r APIRule) %}
{% func seriesFetchedWarn(r apiRule) %}
{% if isNoMatch(r) %}
<svg xmlns="http://www.w3.org/2000/svg"
data-bs-toggle="tooltip"
@ -596,7 +596,7 @@ btn-primary
{% endfunc %}
{%code
func isNoMatch (r APIRule) bool {
func isNoMatch (r apiRule) bool {
return r.LastSamples == 0 && r.LastSeriesFetched != nil && *r.LastSeriesFetched == 0
}
%}

View file

@ -196,7 +196,7 @@ func buttonActive(filter, expValue string) string {
}
//line app/vmalert/web.qtpl:41
func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups []APIGroup) {
func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups []apiGroup) {
//line app/vmalert/web.qtpl:41
qw422016.N().S(`
`)
@ -216,9 +216,9 @@ func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups [
rOk := make(map[string]int)
rNotOk := make(map[string]int)
rNoMatch := make(map[string]int)
var groups []APIGroup
var groups []apiGroup
for _, g := range originGroups {
var rules []APIRule
var rules []apiRule
for _, r := range g.Rules {
if r.LastError != "" {
rNotOk[g.ID]++
@ -610,7 +610,7 @@ func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups [
}
//line app/vmalert/web.qtpl:166
func WriteListGroups(qq422016 qtio422016.Writer, r *http.Request, originGroups []APIGroup) {
func WriteListGroups(qq422016 qtio422016.Writer, r *http.Request, originGroups []apiGroup) {
//line app/vmalert/web.qtpl:166
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/web.qtpl:166
@ -621,7 +621,7 @@ func WriteListGroups(qq422016 qtio422016.Writer, r *http.Request, originGroups [
}
//line app/vmalert/web.qtpl:166
func ListGroups(r *http.Request, originGroups []APIGroup) string {
func ListGroups(r *http.Request, originGroups []apiGroup) string {
//line app/vmalert/web.qtpl:166
qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/web.qtpl:166
@ -636,7 +636,7 @@ func ListGroups(r *http.Request, originGroups []APIGroup) string {
}
//line app/vmalert/web.qtpl:169
func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts []GroupAlerts) {
func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts []groupAlerts) {
//line app/vmalert/web.qtpl:169
qw422016.N().S(`
`)
@ -712,7 +712,7 @@ func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts []
`)
//line app/vmalert/web.qtpl:185
var keys []string
alertsByRule := make(map[string][]*APIAlert)
alertsByRule := make(map[string][]*apiAlert)
for _, alert := range ga.Alerts {
if len(alertsByRule[alert.RuleID]) < 1 {
keys = append(keys, alert.RuleID)
@ -891,7 +891,7 @@ func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts []
}
//line app/vmalert/web.qtpl:255
func WriteListAlerts(qq422016 qtio422016.Writer, r *http.Request, groupAlerts []GroupAlerts) {
func WriteListAlerts(qq422016 qtio422016.Writer, r *http.Request, groupAlerts []groupAlerts) {
//line app/vmalert/web.qtpl:255
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/web.qtpl:255
@ -902,7 +902,7 @@ func WriteListAlerts(qq422016 qtio422016.Writer, r *http.Request, groupAlerts []
}
//line app/vmalert/web.qtpl:255
func ListAlerts(r *http.Request, groupAlerts []GroupAlerts) string {
func ListAlerts(r *http.Request, groupAlerts []groupAlerts) string {
//line app/vmalert/web.qtpl:255
qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/web.qtpl:255
@ -1091,7 +1091,7 @@ func ListTargets(r *http.Request, targets map[notifier.TargetType][]notifier.Tar
}
//line app/vmalert/web.qtpl:313
func StreamAlert(qw422016 *qt422016.Writer, r *http.Request, alert *APIAlert) {
func StreamAlert(qw422016 *qt422016.Writer, r *http.Request, alert *apiAlert) {
//line app/vmalert/web.qtpl:313
qw422016.N().S(`
`)
@ -1274,7 +1274,7 @@ func StreamAlert(qw422016 *qt422016.Writer, r *http.Request, alert *APIAlert) {
}
//line app/vmalert/web.qtpl:397
func WriteAlert(qq422016 qtio422016.Writer, r *http.Request, alert *APIAlert) {
func WriteAlert(qq422016 qtio422016.Writer, r *http.Request, alert *apiAlert) {
//line app/vmalert/web.qtpl:397
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/web.qtpl:397
@ -1285,7 +1285,7 @@ func WriteAlert(qq422016 qtio422016.Writer, r *http.Request, alert *APIAlert) {
}
//line app/vmalert/web.qtpl:397
func Alert(r *http.Request, alert *APIAlert) string {
func Alert(r *http.Request, alert *apiAlert) string {
//line app/vmalert/web.qtpl:397
qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/web.qtpl:397
@ -1300,7 +1300,7 @@ func Alert(r *http.Request, alert *APIAlert) string {
}
//line app/vmalert/web.qtpl:400
func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule) {
func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule apiRule) {
//line app/vmalert/web.qtpl:400
qw422016.N().S(`
`)
@ -1331,9 +1331,9 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
var seriesFetchedEnabled bool
var seriesFetchedWarning bool
for _, u := range rule.Updates {
if u.seriesFetched != nil {
if u.SeriesFetched != nil {
seriesFetchedEnabled = true
if *u.seriesFetched == 0 && u.samples == 0 {
if *u.SeriesFetched == 0 && u.Samples == 0 {
seriesFetchedWarning = true
}
}
@ -1587,7 +1587,7 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
qw422016.N().S(`
<tr`)
//line app/vmalert/web.qtpl:540
if u.err != nil {
if u.Err != nil {
//line app/vmalert/web.qtpl:540
qw422016.N().S(` class="alert-danger"`)
//line app/vmalert/web.qtpl:540
@ -1597,13 +1597,13 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
<td>
<span class="badge bg-primary rounded-pill me-3" title="Updated at">`)
//line app/vmalert/web.qtpl:542
qw422016.E().S(u.time.Format(time.RFC3339))
qw422016.E().S(u.Time.Format(time.RFC3339))
//line app/vmalert/web.qtpl:542
qw422016.N().S(`</span>
</td>
<td class="text-center">`)
//line app/vmalert/web.qtpl:544
qw422016.N().D(u.samples)
qw422016.N().D(u.Samples)
//line app/vmalert/web.qtpl:544
qw422016.N().S(`</td>
`)
@ -1612,9 +1612,9 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
//line app/vmalert/web.qtpl:545
qw422016.N().S(`<td class="text-center">`)
//line app/vmalert/web.qtpl:545
if u.seriesFetched != nil {
if u.SeriesFetched != nil {
//line app/vmalert/web.qtpl:545
qw422016.N().D(*u.seriesFetched)
qw422016.N().D(*u.SeriesFetched)
//line app/vmalert/web.qtpl:545
}
//line app/vmalert/web.qtpl:545
@ -1625,18 +1625,18 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
qw422016.N().S(`
<td class="text-center">`)
//line app/vmalert/web.qtpl:546
qw422016.N().FPrec(u.duration.Seconds(), 3)
qw422016.N().FPrec(u.Duration.Seconds(), 3)
//line app/vmalert/web.qtpl:546
qw422016.N().S(`s</td>
<td class="text-center">`)
//line app/vmalert/web.qtpl:547
qw422016.E().S(u.at.Format(time.RFC3339))
qw422016.E().S(u.At.Format(time.RFC3339))
//line app/vmalert/web.qtpl:547
qw422016.N().S(`</td>
<td>
<textarea class="curl-area" rows="1" onclick="this.focus();this.select()">`)
//line app/vmalert/web.qtpl:549
qw422016.E().S(u.curl)
qw422016.E().S(u.Curl)
//line app/vmalert/web.qtpl:549
qw422016.N().S(`</textarea>
</td>
@ -1644,12 +1644,12 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
</li>
`)
//line app/vmalert/web.qtpl:553
if u.err != nil {
if u.Err != nil {
//line app/vmalert/web.qtpl:553
qw422016.N().S(`
<tr`)
//line app/vmalert/web.qtpl:554
if u.err != nil {
if u.Err != nil {
//line app/vmalert/web.qtpl:554
qw422016.N().S(` class="alert-danger"`)
//line app/vmalert/web.qtpl:554
@ -1671,7 +1671,7 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
qw422016.N().S(`">
<span class="alert-danger">`)
//line app/vmalert/web.qtpl:556
qw422016.E().V(u.err)
qw422016.E().V(u.Err)
//line app/vmalert/web.qtpl:556
qw422016.N().S(`</span>
</td>
@ -1697,7 +1697,7 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
}
//line app/vmalert/web.qtpl:563
func WriteRuleDetails(qq422016 qtio422016.Writer, r *http.Request, rule APIRule) {
func WriteRuleDetails(qq422016 qtio422016.Writer, r *http.Request, rule apiRule) {
//line app/vmalert/web.qtpl:563
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/web.qtpl:563
@ -1708,7 +1708,7 @@ func WriteRuleDetails(qq422016 qtio422016.Writer, r *http.Request, rule APIRule)
}
//line app/vmalert/web.qtpl:563
func RuleDetails(r *http.Request, rule APIRule) string {
func RuleDetails(r *http.Request, rule apiRule) string {
//line app/vmalert/web.qtpl:563
qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/web.qtpl:563
@ -1853,7 +1853,7 @@ func badgeStabilizing() string {
}
//line app/vmalert/web.qtpl:585
func streamseriesFetchedWarn(qw422016 *qt422016.Writer, r APIRule) {
func streamseriesFetchedWarn(qw422016 *qt422016.Writer, r apiRule) {
//line app/vmalert/web.qtpl:585
qw422016.N().S(`
`)
@ -1879,7 +1879,7 @@ func streamseriesFetchedWarn(qw422016 *qt422016.Writer, r APIRule) {
}
//line app/vmalert/web.qtpl:596
func writeseriesFetchedWarn(qq422016 qtio422016.Writer, r APIRule) {
func writeseriesFetchedWarn(qq422016 qtio422016.Writer, r apiRule) {
//line app/vmalert/web.qtpl:596
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/web.qtpl:596
@ -1890,7 +1890,7 @@ func writeseriesFetchedWarn(qq422016 qtio422016.Writer, r APIRule) {
}
//line app/vmalert/web.qtpl:596
func seriesFetchedWarn(r APIRule) string {
func seriesFetchedWarn(r apiRule) string {
//line app/vmalert/web.qtpl:596
qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/web.qtpl:596
@ -1905,6 +1905,6 @@ func seriesFetchedWarn(r APIRule) string {
}
//line app/vmalert/web.qtpl:599
func isNoMatch(r APIRule) bool {
func isNoMatch(r apiRule) bool {
return r.LastSamples == 0 && r.LastSeriesFetched != nil && *r.LastSeriesFetched == 0
}

View file

@ -1,6 +1,7 @@
package main
import (
"context"
"encoding/json"
"fmt"
"net/http"
@ -9,32 +10,29 @@ import (
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
)
func TestHandler(t *testing.T) {
ar := &AlertingRule{
Name: "alert",
alerts: map[uint64]*notifier.Alert{
0: {State: notifier.StateFiring},
},
state: newRuleState(10),
}
ar.state.add(ruleStateEntry{
time: time.Now(),
at: time.Now(),
samples: 10,
fq := &datasource.FakeQuerier{}
fq.Add(datasource.Metric{
Values: []float64{1}, Timestamps: []int64{0},
})
rr := &RecordingRule{
Name: "record",
state: newRuleState(10),
g := &rule.Group{
Name: "group",
Concurrency: 1,
}
g := &Group{
Name: "group",
Rules: []Rule{ar, rr},
}
m := &manager{groups: make(map[uint64]*Group)}
m.groups[0] = g
ar := rule.NewAlertingRule(fq, g, config.Rule{ID: 0, Alert: "alert"})
rr := rule.NewRecordingRule(fq, g, config.Rule{ID: 1, Record: "record"})
g.Rules = []rule.Rule{ar, rr}
g.ExecOnce(context.Background(), func() []notifier.Notifier { return nil }, nil, time.Time{})
m := &manager{groups: map[uint64]*rule.Group{
g.ID(): g,
}}
rh := &requestHandler{m: m}
getResp := func(url string, to interface{}, code int) {
@ -70,13 +68,13 @@ func TestHandler(t *testing.T) {
})
t.Run("/vmalert/rule", func(t *testing.T) {
a := ar.ToAPI()
a := ruleToAPI(ar)
getResp(ts.URL+"/vmalert/"+a.WebLink(), nil, 200)
r := rr.ToAPI()
r := ruleToAPI(rr)
getResp(ts.URL+"/vmalert/"+r.WebLink(), nil, 200)
})
t.Run("/vmalert/alert", func(t *testing.T) {
alerts := ar.AlertsToAPI()
alerts := ruleToAPIAlert(ar)
for _, a := range alerts {
getResp(ts.URL+"/vmalert/"+a.WebLink(), nil, 200)
}
@ -103,14 +101,14 @@ func TestHandler(t *testing.T) {
}
})
t.Run("/api/v1/alert?alertID&groupID", func(t *testing.T) {
expAlert := ar.newAlertAPI(*ar.alerts[0])
alert := &APIAlert{}
expAlert := newAlertAPI(ar, ar.GetAlerts()[0])
alert := &apiAlert{}
getResp(ts.URL+"/"+expAlert.APILink(), alert, 200)
if !reflect.DeepEqual(alert, expAlert) {
t.Errorf("expected %v is equal to %v", alert, expAlert)
}
alert = &APIAlert{}
alert = &apiAlert{}
getResp(ts.URL+"/vmalert/"+expAlert.APILink(), alert, 200)
if !reflect.DeepEqual(alert, expAlert) {
t.Errorf("expected %v is equal to %v", alert, expAlert)
@ -148,7 +146,7 @@ func TestHandler(t *testing.T) {
}
func TestEmptyResponse(t *testing.T) {
rhWithNoGroups := &requestHandler{m: &manager{groups: make(map[uint64]*Group)}}
rhWithNoGroups := &requestHandler{m: &manager{groups: make(map[uint64]*rule.Group)}}
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { rhWithNoGroups.handler(w, r) }))
defer ts.Close()
@ -201,7 +199,7 @@ func TestEmptyResponse(t *testing.T) {
}
})
rhWithEmptyGroup := &requestHandler{m: &manager{groups: map[uint64]*Group{0: {Name: "test"}}}}
rhWithEmptyGroup := &requestHandler{m: &manager{groups: map[uint64]*rule.Group{0: {Name: "test"}}}}
ts.Config.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { rhWithEmptyGroup.handler(w, r) })
t.Run("empty group /api/v1/rules", func(t *testing.T) {

View file

@ -2,13 +2,28 @@ package main
import (
"fmt"
"net/url"
"sort"
"strconv"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
)
// APIAlert represents a notifier.AlertingRule state
const (
// ParamGroupID is group id key in url parameter
paramGroupID = "group_id"
// ParamAlertID is alert id key in url parameter
paramAlertID = "alert_id"
// ParamRuleID is rule id key in url parameter
paramRuleID = "rule_id"
)
// apiAlert represents a notifier.AlertingRule state
// for WEB view
// https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules
type APIAlert struct {
type apiAlert struct {
State string `json:"state"`
Name string `json:"name"`
Value string `json:"value"`
@ -38,24 +53,24 @@ type APIAlert struct {
}
// WebLink returns a link to the alert which can be used in UI.
func (aa *APIAlert) WebLink() string {
func (aa *apiAlert) WebLink() string {
return fmt.Sprintf("alert?%s=%s&%s=%s",
paramGroupID, aa.GroupID, paramAlertID, aa.ID)
}
// APILink returns a link to the alert's JSON representation.
func (aa *APIAlert) APILink() string {
func (aa *apiAlert) APILink() string {
return fmt.Sprintf("api/v1/alert?%s=%s&%s=%s",
paramGroupID, aa.GroupID, paramAlertID, aa.ID)
}
// APIGroup represents Group for WEB view
// apiGroup represents Group for web view
// https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules
type APIGroup struct {
type apiGroup struct {
// Name is the group name as present in the config
Name string `json:"name"`
// Rules contains both recording and alerting rules
Rules []APIRule `json:"rules"`
Rules []apiRule `json:"rules"`
// Interval is the Group's evaluation interval in float seconds as present in the file.
Interval float64 `json:"interval"`
// LastEvaluation is the timestamp of the last time the Group was executed
@ -81,15 +96,15 @@ type APIGroup struct {
Labels map[string]string `json:"labels,omitempty"`
}
// GroupAlerts represents a group of alerts for WEB view
type GroupAlerts struct {
Group APIGroup
Alerts []*APIAlert
// groupAlerts represents a group of alerts for WEB view
type groupAlerts struct {
Group apiGroup
Alerts []*apiAlert
}
// APIRule represents a Rule for WEB view
// apiRule represents a Rule for web view
// see https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules
type APIRule struct {
type apiRule struct {
// State must be one of these under following scenarios
// "pending": at least 1 alert in the rule in pending state and no other alert in firing ruleState.
// "firing": at least 1 alert in the rule in firing state.
@ -111,7 +126,7 @@ type APIRule struct {
// LastEvaluation is the timestamp of the last time the rule was executed
LastEvaluation time.Time `json:"lastEvaluation"`
// Alerts is the list of all the alerts in this rule that are currently pending or firing
Alerts []*APIAlert `json:"alerts,omitempty"`
Alerts []*apiAlert `json:"alerts,omitempty"`
// Health is the health of rule evaluation.
// It MUST be one of "ok", "err", "unknown"
Health string `json:"health"`
@ -138,11 +153,206 @@ type APIRule struct {
// MaxUpdates is the max number of recorded ruleStateEntry objects
MaxUpdates int `json:"max_updates_entries"`
// Updates contains the ordered list of recorded ruleStateEntry objects
Updates []ruleStateEntry `json:"-"`
Updates []rule.StateEntry `json:"-"`
}
// WebLink returns a link to the alert which can be used in UI.
func (ar APIRule) WebLink() string {
func (ar apiRule) WebLink() string {
return fmt.Sprintf("rule?%s=%s&%s=%s",
paramGroupID, ar.GroupID, paramRuleID, ar.ID)
}
func ruleToAPI(r interface{}) apiRule {
if ar, ok := r.(*rule.AlertingRule); ok {
return alertingToAPI(ar)
}
if rr, ok := r.(*rule.RecordingRule); ok {
return recordingToAPI(rr)
}
return apiRule{}
}
func recordingToAPI(rr *rule.RecordingRule) apiRule {
lastState := rule.GetLastEntry(rr)
r := apiRule{
Type: "recording",
DatasourceType: rr.Type.String(),
Name: rr.Name,
Query: rr.Expr,
Labels: rr.Labels,
LastEvaluation: lastState.Time,
EvaluationTime: lastState.Duration.Seconds(),
Health: "ok",
LastSamples: lastState.Samples,
LastSeriesFetched: lastState.SeriesFetched,
MaxUpdates: rule.GetRuleStateSize(rr),
Updates: rule.GetAllRuleState(rr),
// encode as strings to avoid rounding
ID: fmt.Sprintf("%d", rr.ID()),
GroupID: fmt.Sprintf("%d", rr.GroupID),
}
if lastState.Err != nil {
r.LastError = lastState.Err.Error()
r.Health = "err"
}
return r
}
// alertingToAPI returns Rule representation in form of apiRule
func alertingToAPI(ar *rule.AlertingRule) apiRule {
lastState := rule.GetLastEntry(ar)
r := apiRule{
Type: "alerting",
DatasourceType: ar.Type.String(),
Name: ar.Name,
Query: ar.Expr,
Duration: ar.For.Seconds(),
KeepFiringFor: ar.KeepFiringFor.Seconds(),
Labels: ar.Labels,
Annotations: ar.Annotations,
LastEvaluation: lastState.Time,
EvaluationTime: lastState.Duration.Seconds(),
Health: "ok",
State: "inactive",
Alerts: ruleToAPIAlert(ar),
LastSamples: lastState.Samples,
LastSeriesFetched: lastState.SeriesFetched,
MaxUpdates: rule.GetRuleStateSize(ar),
Updates: rule.GetAllRuleState(ar),
Debug: ar.Debug,
// encode as strings to avoid rounding in JSON
ID: fmt.Sprintf("%d", ar.ID()),
GroupID: fmt.Sprintf("%d", ar.GroupID),
}
if lastState.Err != nil {
r.LastError = lastState.Err.Error()
r.Health = "err"
}
// satisfy apiRule.State logic
if len(r.Alerts) > 0 {
r.State = notifier.StatePending.String()
stateFiring := notifier.StateFiring.String()
for _, a := range r.Alerts {
if a.State == stateFiring {
r.State = stateFiring
break
}
}
}
return r
}
// ruleToAPIAlert generates list of apiAlert objects from existing alerts
func ruleToAPIAlert(ar *rule.AlertingRule) []*apiAlert {
var alerts []*apiAlert
for _, a := range ar.GetAlerts() {
if a.State == notifier.StateInactive {
continue
}
alerts = append(alerts, newAlertAPI(ar, a))
}
return alerts
}
// alertToAPI generates apiAlert object from alert by its id(hash)
func alertToAPI(ar *rule.AlertingRule, id uint64) *apiAlert {
a := ar.GetAlert(id)
if a == nil {
return nil
}
return newAlertAPI(ar, a)
}
// NewAlertAPI creates apiAlert for notifier.Alert
func newAlertAPI(ar *rule.AlertingRule, a *notifier.Alert) *apiAlert {
aa := &apiAlert{
// encode as strings to avoid rounding
ID: fmt.Sprintf("%d", a.ID),
GroupID: fmt.Sprintf("%d", a.GroupID),
RuleID: fmt.Sprintf("%d", ar.RuleID),
Name: a.Name,
Expression: ar.Expr,
Labels: a.Labels,
Annotations: a.Annotations,
State: a.State.String(),
ActiveAt: a.ActiveAt,
Restored: a.Restored,
Value: strconv.FormatFloat(a.Value, 'f', -1, 32),
}
if alertURLGeneratorFn != nil {
aa.SourceLink = alertURLGeneratorFn(*a)
}
if a.State == notifier.StateFiring && !a.KeepFiringSince.IsZero() {
aa.Stabilizing = true
}
return aa
}
func groupToAPI(g *rule.Group) apiGroup {
g = g.DeepCopy()
ag := apiGroup{
// encode as string to avoid rounding
ID: fmt.Sprintf("%d", g.ID()),
Name: g.Name,
Type: g.Type.String(),
File: g.File,
Interval: g.Interval.Seconds(),
LastEvaluation: g.LastEvaluation,
Concurrency: g.Concurrency,
Params: urlValuesToStrings(g.Params),
Headers: headersToStrings(g.Headers),
NotifierHeaders: headersToStrings(g.NotifierHeaders),
Labels: g.Labels,
}
ag.Rules = make([]apiRule, 0)
for _, r := range g.Rules {
ag.Rules = append(ag.Rules, ruleToAPI(r))
}
return ag
}
func urlValuesToStrings(values url.Values) []string {
if len(values) < 1 {
return nil
}
keys := make([]string, 0, len(values))
for k := range values {
keys = append(keys, k)
}
sort.Strings(keys)
var res []string
for _, k := range keys {
params := values[k]
for _, v := range params {
res = append(res, fmt.Sprintf("%s=%s", k, v))
}
}
return res
}
func headersToStrings(headers map[string]string) []string {
if len(headers) < 1 {
return nil
}
keys := make([]string, 0, len(headers))
for k := range headers {
keys = append(keys, k)
}
sort.Strings(keys)
var res []string
for _, k := range keys {
v := headers[k]
res = append(res, fmt.Sprintf("%s: %s", k, v))
}
return res
}

View file

@ -0,0 +1,23 @@
package main
import (
"testing"
)
func TestUrlValuesToStrings(t *testing.T) {
mapQueryParams := map[string][]string{
"param1": {"param1"},
"param2": {"anotherparam"},
}
expectedRes := []string{"param1=param1", "param2=anotherparam"}
res := urlValuesToStrings(mapQueryParams)
if len(res) != len(expectedRes) {
t.Errorf("Expected length %d, but got %d", len(expectedRes), len(res))
}
for ind, val := range expectedRes {
if val != res[ind] {
t.Errorf("Expected %v; but got %v", val, res[ind])
}
}
}

View file

@ -7,7 +7,7 @@ The `-auth.config` can point to either local file or to http url.
## Quick start
Just download `vmutils-*` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases), unpack it
Just download `vmutils-*` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest), unpack it
and pass the following flag to `vmauth` binary in order to start authorizing and routing requests:
```console
@ -317,7 +317,7 @@ metric without label (if `unauthorized_user` section of config is used).
## How to build from sources
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - `vmauth` is located in `vmutils-*` archives there.
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) - `vmauth` is located in `vmutils-*` archives there.
### Development build
@ -395,9 +395,11 @@ See the docs at https://docs.victoriametrics.com/vmauth.html .
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-eula
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-failTimeout duration
Sets a delay period for load balancing to skip a malfunctioning backend. (defaults 3s)
Sets a delay period for load balancing to skip a malfunctioning backend (default 3s)
-filestream.disableFadvise
Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU
-flagsAuthKey string
Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
-fs.disableMmap
@ -429,11 +431,11 @@ See the docs at https://docs.victoriametrics.com/vmauth.html .
-internStringMaxLen int
The maximum length for strings to intern. A lower limit may save memory at the cost of higher CPU usage. See https://en.wikipedia.org/wiki/String_interning . See also -internStringDisableCache and -internStringCacheExpireDuration (default 500)
-license string
enterprise license key. This flag is available only in VictoriaMetrics enterprise. Documentation - https://docs.victoriametrics.com/enterprise.html, for more information, visit https://victoriametrics.com/products/enterprise/ . To request a trial license, go to https://victoriametrics.com/products/enterprise/trial/
Lisense key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed via file specified by -licenseFile command-line flag
-license.forceOffline
enables offline license verification. License keys issued must support this feature. Contact our support team for license keys with offline check support.
Whether to enable offline verification for VictoriaMetrics Enterprise license key, which has been passed either via -license or via -licenseFile command-line flag. The issued license key must support offline verification feature. Contact info@victoriametrics.com if you need offline license verification. This flag is avilable only in Enterprise binaries
-licenseFile string
path to file with enterprise license key. This flag is available only in VictoriaMetrics enterprise. Documentation - https://docs.victoriametrics.com/enterprise.html, for more information, visit https://victoriametrics.com/products/enterprise/ . To request a trial license, go to https://victoriametrics.com/products/enterprise/trial/
Path to file with license key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed inline via -license command-line flag
-logInvalidAuthTokens
Whether to log requests with invalid auth tokens. Such requests are always counted at vmauth_http_request_errors_total{reason="invalid_auth_token"} metric, which is exposed at /metrics page
-loggerDisableTimestamps

View file

@ -267,10 +267,11 @@ You have to add a custom url endpoint via flag:
### Permanent deletion of objects in S3 and compatible storages
By default, when using S3 compatible storages, `vmbackup` and `vmbackupmanager` will use the basic delete operation,
which will delete current version of the object only.
In order to enforce removing all versions of an object when object is deleted, you need to use `-deleteAllObjectVersions` flag.
Using this flag will enforce listing all versions of an object and deleting them one by one.
`vmbackup` and [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager.html) use standard delete operation
for S3-compatible object storage when pefrorming [incremental backups](#incremental-backups).
This operation removes only the current version of the object. This works OK in most cases.
Sometimes it is needed to remove all the versions of an object. In this case pass `-deleteAllObjectVersions` command-line flag to `vmbackup`.
Alternatively, it is possible to use object storage lifecycle rules to remove non-current versions of objects automatically.
Refer to the respective documentation for your object storage provider for more details.
@ -304,7 +305,7 @@ Run `vmbackup -help` in order to see all the available options:
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-eula
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-filestream.disableFadvise
Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU
-flagsAuthKey string
@ -336,11 +337,11 @@ Run `vmbackup -help` in order to see all the available options:
-internStringMaxLen int
The maximum length for strings to intern. A lower limit may save memory at the cost of higher CPU usage. See https://en.wikipedia.org/wiki/String_interning . See also -internStringDisableCache and -internStringCacheExpireDuration (default 500)
-license string
enterprise license key. This flag is available only in VictoriaMetrics enterprise. Documentation - https://docs.victoriametrics.com/enterprise.html, for more information, visit https://victoriametrics.com/products/enterprise/ . To request a trial license, go to https://victoriametrics.com/products/enterprise/trial/
Lisense key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed via file specified by -licenseFile command-line flag
-license.forceOffline
enables offline license verification. License keys issued must support this feature. Contact our support team for license keys with offline check support.
Whether to enable offline verification for VictoriaMetrics Enterprise license key, which has been passed either via -license or via -licenseFile command-line flag. The issued license key must support offline verification feature. Contact info@victoriametrics.com if you need offline license verification. This flag is avilable only in Enterprise binaries
-licenseFile string
path to file with enterprise license key. This flag is available only in VictoriaMetrics enterprise. Documentation - https://docs.victoriametrics.com/enterprise.html, for more information, visit https://victoriametrics.com/products/enterprise/ . To request a trial license, go to https://victoriametrics.com/products/enterprise/trial/
Path to file with license key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed inline via -license command-line flag
-loggerDisableTimestamps
Whether to disable writing timestamps in logs
-loggerErrorsPerSecondLimit int
@ -379,15 +380,11 @@ Run `vmbackup -help` in order to see all the available options:
-pushmetrics.url array
Optional URL to push metrics exposed at /metrics page. See https://docs.victoriametrics.com/#push-metrics . By default, metrics exposed at /metrics page aren't pushed to any remote storage
Supports an array of values separated by comma or specified via multiple flags.
-s2a_enable_appengine_dialer
If true, opportunistically use AppEngine-specific dialer to call S2A.
-s2a_timeout duration
Timeout enforced on the connection to the S2A service for handshake. (default 3s)
-s3ForcePathStyle
Prefixing endpoint with bucket name when set false, true by default. (default true)
-s3StorageClass string
The Storage Class applied to objects uploaded to AWS S3. Supported values are: GLACIER, DEEP_ARCHIVE, GLACIER_IR, INTELLIGENT_TIERING, ONEZONE_IA, OUTPOSTS, REDUCED_REDUNDANCY, STANDARD, STANDARD_IA.
See https://docs.aws.amazon.com/AmazonS3/latest/userguide/storage-class-intro.html/
See https://docs.aws.amazon.com/AmazonS3/latest/userguide/storage-class-intro.html
-snapshot.createURL string
VictoriaMetrics create snapshot url. When this is given a snapshot will automatically be created during backup. Example: http://victoriametrics:8428/snapshot/create . There is no need in setting -snapshotName if -snapshot.createURL is set
-snapshot.deleteURL string
@ -413,7 +410,7 @@ Run `vmbackup -help` in order to see all the available options:
## How to build from sources
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - see `vmutils-*` archives there.
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) - see `vmutils-*` archives there.
### Development build

View file

@ -1,7 +1,7 @@
# vmbackupmanager
***vmbackupmanager is a part of [enterprise package](https://docs.victoriametrics.com/enterprise.html).
It is available for download and evaluation at [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases).
It is available for download and evaluation at [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest).
See how to request a free trial license [here](https://victoriametrics.com/products/enterprise/trial/).***
The VictoriaMetrics backup manager automates regular backup procedures. It supports the following backup intervals: **hourly**, **daily**, **weekly** and **monthly**.
@ -439,7 +439,9 @@ command-line flags:
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-eula
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-filestream.disableFadvise
Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU
-flagsAuthKey string
Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
-fs.disableMmap
@ -477,11 +479,11 @@ command-line flags:
-keepLastWeekly int
Keep last N weekly backups. If 0 is specified next retention cycle removes all backups for given time period. (default -1)
-license string
enterprise license key. This flag is available only in VictoriaMetrics enterprise. Documentation - https://docs.victoriametrics.com/enterprise.html, for more information, visit https://victoriametrics.com/products/enterprise/ . To request a trial license, go to https://victoriametrics.com/products/enterprise/trial/
Lisense key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed via file specified by -licenseFile command-line flag
-license.forceOffline
enables offline license verification. License keys issued must support this feature. Contact our support team for license keys with offline check support.
Whether to enable offline verification for VictoriaMetrics Enterprise license key, which has been passed either via -license or via -licenseFile command-line flag. The issued license key must support offline verification feature. Contact info@victoriametrics.com if you need offline license verification. This flag is avilable only in Enterprise binaries
-licenseFile string
path to file with enterprise license key. This flag is available only in VictoriaMetrics enterprise. Documentation - https://docs.victoriametrics.com/enterprise.html, for more information, visit https://victoriametrics.com/products/enterprise/ . To request a trial license, go to https://victoriametrics.com/products/enterprise/trial/
Path to file with license key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed inline via -license command-line flag
-loggerDisableTimestamps
Whether to disable writing timestamps in logs
-loggerErrorsPerSecondLimit int
@ -519,15 +521,11 @@ command-line flags:
Supports an array of values separated by comma or specified via multiple flags.
-runOnStart
Upload backups immediately after start of the service. Otherwise the backup starts on new hour
-s2a_enable_appengine_dialer
If true, opportunistically use AppEngine-specific dialer to call S2A.
-s2a_timeout duration
Timeout enforced on the connection to the S2A service for handshake. (default 3s)
-s3ForcePathStyle
Prefixing endpoint with bucket name when set false, true by default. (default true)
-s3StorageClass string
The Storage Class applied to objects uploaded to AWS S3. Supported values are: GLACIER, DEEP_ARCHIVE, GLACIER_IR, INTELLIGENT_TIERING, ONEZONE_IA, OUTPOSTS, REDUCED_REDUNDANCY, STANDARD, STANDARD_IA.
See https://docs.aws.amazon.com/AmazonS3/latest/userguide/storage-class-intro.html/
See https://docs.aws.amazon.com/AmazonS3/latest/userguide/storage-class-intro.html
-snapshot.createURL string
VictoriaMetrics create snapshot url. When this is given a snapshot will automatically be created during backup.Example: http://victoriametrics:8428/snapshot/create
-snapshot.deleteURL string

View file

@ -1064,7 +1064,7 @@ as a proxy between `vmctl` and destination with `-remoteWrite.rateLimit` flag en
## How to build
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - `vmctl` is located in `vmutils-*` archives there.
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) - `vmctl` is located in `vmutils-*` archives there.
### Development build

View file

@ -1,7 +1,7 @@
# vmgateway
***vmgateway is a part of [enterprise package](https://docs.victoriametrics.com/enterprise.html).
It is available for download and evaluation at [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases).
It is available for download and evaluation at [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest).
See how to request a free trial license [here](https://victoriametrics.com/products/enterprise/trial/).***
<img alt="vmgateway" src="vmgateway-overview.jpeg">
@ -293,6 +293,8 @@ The shortlist of configuration flags include the following:
Optional path to bearer token file to use for -datasource.url.
-datasource.disableKeepAlive
Whether to disable long-lived connections to the datasource. If true, disables HTTP keep-alives and will only use the connection to the server for a single HTTP request.
-datasource.disableStepParam
Whether to disable adding 'step' param to the issued instant queries. This might be useful when using vmalert with datasources that do not support 'step' param for instant queries, like Google Managed Prometheus. It is not recommended to enable this flag if you use vmalert with VictoriaMetrics.
-datasource.headers string
Optional HTTP extraHeaders to send with each request to the corresponding -datasource.url. For example, -datasource.headers='My-Auth:foobar' would send 'My-Auth: foobar' HTTP header with every request to the corresponding -datasource.url. Multiple headers must be delimited by '^^': -datasource.headers='header1:value1^^header2:value2'
-datasource.lookback duration
@ -312,11 +314,11 @@ The shortlist of configuration flags include the following:
-datasource.queryStep duration
How far a value can fallback to when evaluating queries. For example, if -datasource.queryStep=15s then param "step" with value "15s" will be added to every query. If set to 0, rule's evaluation interval will be used instead. (default 5m0s)
-datasource.queryTimeAlignment
Whether to align "time" parameter with evaluation interval.Alignment supposed to produce deterministic results despite number of vmalert replicas or time they were started. See more details here https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257 (default true)
Deprecated: please use "eval_alignment" in rule group instead. Whether to align "time" parameter with evaluation interval. Alignment supposed to produce deterministic results despite number of vmalert replicas or time they were started. See more details at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257 (default true)
-datasource.roundDigits int
Adds "round_digits" GET param to datasource requests. In VM "round_digits" limits the number of digits after the decimal point in response values.
-datasource.showURL
Whether to show -datasource.url in the exported metrics. It is hidden by default, since it can contain sensitive info such as auth key
Whether to avoid stripping sensitive information such as auth headers or passwords from URLs in log messages or UI and exported metrics. It is hidden by default, since it can contain sensitive info such as auth key
-datasource.tlsCAFile string
Optional path to TLS CA file to use for verifying connections to -datasource.url. By default, system CA is used
-datasource.tlsCertFile string
@ -340,7 +342,9 @@ The shortlist of configuration flags include the following:
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-eula
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-filestream.disableFadvise
Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU
-flagsAuthKey string
Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
-fs.disableMmap
@ -372,11 +376,11 @@ The shortlist of configuration flags include the following:
-internStringMaxLen int
The maximum length for strings to intern. A lower limit may save memory at the cost of higher CPU usage. See https://en.wikipedia.org/wiki/String_interning . See also -internStringDisableCache and -internStringCacheExpireDuration (default 500)
-license string
enterprise license key. This flag is available only in VictoriaMetrics enterprise. Documentation - https://docs.victoriametrics.com/enterprise.html, for more information, visit https://victoriametrics.com/products/enterprise/ . To request a trial license, go to https://victoriametrics.com/products/enterprise/trial/
Lisense key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed via file specified by -licenseFile command-line flag
-license.forceOffline
enables offline license verification. License keys issued must support this feature. Contact our support team for license keys with offline check support.
Whether to enable offline verification for VictoriaMetrics Enterprise license key, which has been passed either via -license or via -licenseFile command-line flag. The issued license key must support offline verification feature. Contact info@victoriametrics.com if you need offline license verification. This flag is avilable only in Enterprise binaries
-licenseFile string
path to file with enterprise license key. This flag is available only in VictoriaMetrics enterprise. Documentation - https://docs.victoriametrics.com/enterprise.html, for more information, visit https://victoriametrics.com/products/enterprise/ . To request a trial license, go to https://victoriametrics.com/products/enterprise/trial/
Path to file with license key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed inline via -license command-line flag
-loggerDisableTimestamps
Whether to disable writing timestamps in logs
-loggerErrorsPerSecondLimit int

View file

@ -33,7 +33,7 @@ var (
)
var (
saCfgReloaderStopCh = make(chan struct{})
saCfgReloaderStopCh chan struct{}
saCfgReloaderWG sync.WaitGroup
saCfgReloads = metrics.NewCounter(`vminsert_streamagg_config_reloads_total`)
@ -62,6 +62,8 @@ func CheckStreamAggrConfig() error {
//
// MustStopStreamAggr must be called when stream aggr is no longer needed.
func InitStreamAggr() {
saCfgReloaderStopCh = make(chan struct{})
if *streamAggrConfig == "" {
return
}

View file

@ -223,19 +223,19 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
}
w.WriteHeader(http.StatusOK)
return true
case "/newrelic/api/v1":
case "/newrelic":
newrelicCheckRequest.Inc()
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(202)
fmt.Fprintf(w, `{"status":"ok"}`)
return true
case "/newrelic/api/v1/inventory/deltas":
case "/newrelic/inventory/deltas":
newrelicInventoryRequests.Inc()
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(202)
fmt.Fprintf(w, `{"payload":{"version": 1, "state": {}, "reset": "false"}}`)
return true
case "/newrelic/api/v1/infra/v2/metrics/events/bulk":
case "/newrelic/infra/v2/metrics/events/bulk":
newrelicWriteRequests.Inc()
if err := newrelic.InsertHandlerForHTTP(r); err != nil {
newrelicWriteErrors.Inc()
@ -386,11 +386,11 @@ var (
opentelemetryPushRequests = metrics.NewCounter(`vm_http_requests_total{path="/opentelemetry/api/v1/push", protocol="opentelemetry"}`)
opentelemetryPushErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/opentelemetry/api/v1/push", protocol="opentelemetry"}`)
newrelicWriteRequests = metrics.NewCounter(`vm_http_requests_total{path="/newrelic/api/v1/infra/v2/metrics/events/bulk", protocol="newrelic"}`)
newrelicWriteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/newrelic/api/v1/infra/v2/metrics/events/bulk", protocol="newrelic"}`)
newrelicWriteRequests = metrics.NewCounter(`vm_http_requests_total{path="/newrelic/infra/v2/metrics/events/bulk", protocol="newrelic"}`)
newrelicWriteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/newrelic/infra/v2/metrics/events/bulk", protocol="newrelic"}`)
newrelicInventoryRequests = metrics.NewCounter(`vm_http_requests_total{path="/newrelic/api/v1/inventory/deltas", protocol="newrelic"}`)
newrelicCheckRequest = metrics.NewCounter(`vm_http_requests_total{path="/newrelic/api/v1", protocol="newrelic"}`)
newrelicInventoryRequests = metrics.NewCounter(`vm_http_requests_total{path="/newrelic/inventory/deltas", protocol="newrelic"}`)
newrelicCheckRequest = metrics.NewCounter(`vm_http_requests_total{path="/newrelic", protocol="newrelic"}`)
promscrapeTargetsRequests = metrics.NewCounter(`vm_http_requests_total{path="/targets"}`)
promscrapeServiceDiscoveryRequests = metrics.NewCounter(`vm_http_requests_total{path="/service-discovery"}`)

View file

@ -18,7 +18,7 @@ var (
rowsPerInsert = metrics.NewHistogram(`vm_rows_per_insert{type="newrelic"}`)
)
// InsertHandlerForHTTP processes remote write for NewRelic POST /infra/v2/metrics/events/bulk request.
// InsertHandlerForHTTP processes remote write for request to /newrelic/infra/v2/metrics/events/bulk request.
func InsertHandlerForHTTP(req *http.Request) error {
extraLabels, err := parserCommon.GetExtraLabels(req)
if err != nil {
@ -26,42 +26,52 @@ func InsertHandlerForHTTP(req *http.Request) error {
}
ce := req.Header.Get("Content-Encoding")
isGzip := ce == "gzip"
return stream.Parse(req.Body, isGzip, func(series []newrelic.Metric) error {
return insertRows(series, extraLabels)
return stream.Parse(req.Body, isGzip, func(rows []newrelic.Row) error {
return insertRows(rows, extraLabels)
})
}
func insertRows(rows []newrelic.Metric, extraLabels []prompbmarshal.Label) error {
func insertRows(rows []newrelic.Row, extraLabels []prompbmarshal.Label) error {
ctx := common.GetInsertCtx()
defer common.PutInsertCtx(ctx)
ctx.Reset(len(rows))
samplesCount := 0
for i := range rows {
samplesCount += len(rows[i].Samples)
}
ctx.Reset(samplesCount)
hasRelabeling := relabel.HasRelabeling()
for i := range rows {
r := &rows[i]
ctx.Labels = ctx.Labels[:0]
ctx.AddLabel("", r.Metric)
for j := range r.Tags {
tag := &r.Tags[j]
ctx.AddLabel(tag.Key, tag.Value)
}
for j := range extraLabels {
label := &extraLabels[j]
ctx.AddLabel(label.Name, label.Value)
}
if hasRelabeling {
ctx.ApplyRelabeling()
}
if len(ctx.Labels) == 0 {
// Skip metric without labels.
continue
}
ctx.SortLabelsIfNeeded()
if err := ctx.WriteDataPoint(nil, ctx.Labels, r.Timestamp, r.Value); err != nil {
return err
samples := r.Samples
for j := range samples {
s := &samples[j]
ctx.Labels = ctx.Labels[:0]
ctx.AddLabelBytes(nil, s.Name)
for k := range r.Tags {
t := &r.Tags[k]
ctx.AddLabelBytes(t.Key, t.Value)
}
for k := range extraLabels {
label := &extraLabels[k]
ctx.AddLabel(label.Name, label.Value)
}
if hasRelabeling {
ctx.ApplyRelabeling()
}
if len(ctx.Labels) == 0 {
// Skip metric without labels.
continue
}
ctx.SortLabelsIfNeeded()
if err := ctx.WriteDataPoint(nil, ctx.Labels, r.Timestamp, s.Value); err != nil {
return err
}
}
}
rowsInserted.Add(len(rows))
rowsPerInsert.Update(float64(len(rows)))
rowsInserted.Add(samplesCount)
rowsPerInsert.Update(float64(samplesCount))
return ctx.FlushBufs()
}

View file

@ -93,6 +93,8 @@ i.e. the end result would be similar to [rsync --delete](https://askubuntu.com/q
See https://cloud.google.com/iam/docs/creating-managing-service-account-keys and https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html
-customS3Endpoint string
Custom S3 endpoint for use with S3-compatible storages (e.g. MinIO). S3 is used if not set
-deleteAllObjectVersions
Whether to prune previous object versions when deleting an object. By default, when object storage has versioning enabled deleting the file removes only current version. This option forces removal of all previous versions. See: https://docs.victoriametrics.com/vmbackup.html#permanent-deletion-of-objects-in-s3-compatible-storages
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default, only IPv4 TCP and UDP are used
-envflag.enable
@ -100,7 +102,9 @@ i.e. the end result would be similar to [rsync --delete](https://askubuntu.com/q
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-eula
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-filestream.disableFadvise
Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU
-flagsAuthKey string
Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
-fs.disableMmap
@ -130,11 +134,11 @@ i.e. the end result would be similar to [rsync --delete](https://askubuntu.com/q
-internStringMaxLen int
The maximum length for strings to intern. A lower limit may save memory at the cost of higher CPU usage. See https://en.wikipedia.org/wiki/String_interning . See also -internStringDisableCache and -internStringCacheExpireDuration (default 500)
-license string
enterprise license key. This flag is available only in VictoriaMetrics enterprise. Documentation - https://docs.victoriametrics.com/enterprise.html, for more information, visit https://victoriametrics.com/products/enterprise/ . To request a trial license, go to https://victoriametrics.com/products/enterprise/trial/
Lisense key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed via file specified by -licenseFile command-line flag
-license.forceOffline
enables offline license verification. License keys issued must support this feature. Contact our support team for license keys with offline check support.
Whether to enable offline verification for VictoriaMetrics Enterprise license key, which has been passed either via -license or via -licenseFile command-line flag. The issued license key must support offline verification feature. Contact info@victoriametrics.com if you need offline license verification. This flag is avilable only in Enterprise binaries
-licenseFile string
path to file with enterprise license key. This flag is available only in VictoriaMetrics enterprise. Documentation - https://docs.victoriametrics.com/enterprise.html, for more information, visit https://victoriametrics.com/products/enterprise/ . To request a trial license, go to https://victoriametrics.com/products/enterprise/trial/
Path to file with license key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed inline via -license command-line flag
-loggerDisableTimestamps
Whether to disable writing timestamps in logs
-loggerErrorsPerSecondLimit int
@ -171,15 +175,11 @@ i.e. the end result would be similar to [rsync --delete](https://askubuntu.com/q
-pushmetrics.url array
Optional URL to push metrics exposed at /metrics page. See https://docs.victoriametrics.com/#push-metrics . By default, metrics exposed at /metrics page aren't pushed to any remote storage
Supports an array of values separated by comma or specified via multiple flags.
-s2a_enable_appengine_dialer
If true, opportunistically use AppEngine-specific dialer to call S2A.
-s2a_timeout duration
Timeout enforced on the connection to the S2A service for handshake. (default 3s)
-s3ForcePathStyle
Prefixing endpoint with bucket name when set false, true by default. (default true)
-s3StorageClass string
The Storage Class applied to objects uploaded to AWS S3. Supported values are: GLACIER, DEEP_ARCHIVE, GLACIER_IR, INTELLIGENT_TIERING, ONEZONE_IA, OUTPOSTS, REDUCED_REDUNDANCY, STANDARD, STANDARD_IA.
See https://docs.aws.amazon.com/AmazonS3/latest/userguide/storage-class-intro.html/
See https://docs.aws.amazon.com/AmazonS3/latest/userguide/storage-class-intro.html
-skipBackupCompleteCheck
Whether to skip checking for 'backup complete' file in -src. This may be useful for restoring from old backups, which were created without 'backup complete' file
-src string
@ -203,7 +203,7 @@ i.e. the end result would be similar to [rsync --delete](https://askubuntu.com/q
## How to build from sources
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - see `vmutils-*` archives there.
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) - see `vmutils-*` archives there.
### Development build

View file

@ -172,8 +172,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
}
// vmui access.
switch {
case path == "/vmui" || path == "/graph":
if path == "/vmui" || path == "/graph" {
// VMUI access via incomplete url without `/` in the end. Redirect to complete url.
// Use relative redirect, since the hostname and path prefix may be incorrect if VictoriaMetrics
// is hidden behind vmauth or similar proxy.
@ -182,29 +181,28 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
newURL := path + "/?" + r.Form.Encode()
httpserver.Redirect(w, newURL)
return true
case strings.HasPrefix(path, "/vmui/"):
if path == "/vmui/custom-dashboards" {
if err := handleVMUICustomDashboards(w); err != nil {
httpserver.Errorf(w, r, "%s", err)
return true
}
}
if strings.HasPrefix(path, "/graph/") {
// This is needed for serving /graph URLs from Prometheus datasource in Grafana.
path = strings.Replace(path, "/graph/", "/vmui/", 1)
}
if path == "/vmui/custom-dashboards" {
if err := handleVMUICustomDashboards(w); err != nil {
httpserver.Errorf(w, r, "%s", err)
return true
}
return true
}
if strings.HasPrefix(path, "/vmui/") {
if strings.HasPrefix(path, "/vmui/static/") {
// Allow clients caching static contents for long period of time, since it shouldn't change over time.
// Path to static contents (such as js and css) must be changed whenever its contents is changed.
// See https://developer.chrome.com/docs/lighthouse/performance/uses-long-cache-ttl/
w.Header().Set("Cache-Control", "max-age=31536000")
}
r.URL.Path = path
vmuiFileServer.ServeHTTP(w, r)
return true
case strings.HasPrefix(path, "/graph/"):
// This is needed for serving /graph URLs from Prometheus datasource in Grafana.
if path == "/graph/custom-dashboards" {
if err := handleVMUICustomDashboards(w); err != nil {
httpserver.Errorf(w, r, "%s", err)
return true
}
return true
}
r.URL.Path = strings.Replace(path, "/graph/", "/vmui/", 1)
vmuiFileServer.ServeHTTP(w, r)
return true
}
if strings.HasPrefix(path, "/api/v1/label/") {

View file

@ -26,6 +26,11 @@ var (
maxTagValuesPerSearch = flag.Int("search.maxTagValues", 100e3, "The maximum number of tag values returned from /api/v1/label/<label_name>/values")
maxSamplesPerSeries = flag.Int("search.maxSamplesPerSeries", 30e6, "The maximum number of raw samples a single query can scan per each time series. This option allows limiting memory usage")
maxSamplesPerQuery = flag.Int("search.maxSamplesPerQuery", 1e9, "The maximum number of raw samples a single query can process across all time series. This protects from heavy queries, which select unexpectedly high number of raw samples. See also -search.maxSamplesPerSeries")
maxWorkersPerQuery = flag.Int("search.maxWorkersPerQuery", defaultMaxWorkersPerQuery, "The maximum number of CPU cores a single query can use. "+
"The default value should work good for most cases. "+
"The flag can be set to lower values for improving performance of big number of concurrently executed queries. "+
"The flag can be set to bigger values for improving performance of heavy queries, which scan big number of time series (>10K) and/or big number of samples (>100M). "+
"There is no sense in setting this flag to values bigger than the number of CPU cores available on the system")
)
// Result is a single timeseries result.
@ -197,13 +202,34 @@ type result struct {
var resultPool sync.Pool
// MaxWorkers returns the maximum number of workers netstorage can spin when calling RunParallel()
// MaxWorkers returns the maximum number of concurrent goroutines, which can be used by RunParallel()
func MaxWorkers() int {
return gomaxprocs
n := *maxWorkersPerQuery
if n <= 0 {
return defaultMaxWorkersPerQuery
}
if n > gomaxprocs {
// There is no sense in running more than gomaxprocs CPU-bound concurrent workers,
// since this may worsen the query performance.
n = gomaxprocs
}
return n
}
var gomaxprocs = cgroup.AvailableCPUs()
var defaultMaxWorkersPerQuery = func() int {
// maxWorkersLimit is the maximum number of CPU cores, which can be used in parallel
// for processing an average query, without significant impact on inter-CPU communications.
const maxWorkersLimit = 32
n := gomaxprocs
if n > maxWorkersLimit {
n = maxWorkersLimit
}
return n
}()
// RunParallel runs f in parallel for all the results from rss.
//
// f shouldn't hold references to rs after returning.

View file

@ -2013,6 +2013,90 @@ func TestExecSuccess(t *testing.T) {
resultExpected := []netstorage.Result{r}
f(q, resultExpected)
})
t.Run(`labels_equal()`, func(t *testing.T) {
t.Parallel()
q := `sort(labels_equal((
label_set(10, "instance", "qwe", "host", "rty"),
label_set(20, "instance", "qwe", "host", "qwe"),
label_set(30, "aaa", "bbb", "instance", "foo", "host", "foo"),
), "instance", "host"))`
r1 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{20, 20, 20, 20, 20, 20},
Timestamps: timestampsExpected,
}
r1.MetricName.Tags = []storage.Tag{
{
Key: []byte("host"),
Value: []byte("qwe"),
},
{
Key: []byte("instance"),
Value: []byte("qwe"),
},
}
r2 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{30, 30, 30, 30, 30, 30},
Timestamps: timestampsExpected,
}
r2.MetricName.Tags = []storage.Tag{
{
Key: []byte("aaa"),
Value: []byte("bbb"),
},
{
Key: []byte("host"),
Value: []byte("foo"),
},
{
Key: []byte("instance"),
Value: []byte("foo"),
},
}
resultExpected := []netstorage.Result{r1, r2}
f(q, resultExpected)
})
t.Run(`drop_empty_series()`, func(t *testing.T) {
t.Parallel()
q := `sort(drop_empty_series(
(
alias(time(), "foo"),
alias(500 + time(), "bar"),
) > 2000
) default 123)`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{123, 123, 123, 2100, 2300, 2500},
Timestamps: timestampsExpected,
}
r.MetricName.MetricGroup = []byte("bar")
resultExpected := []netstorage.Result{r}
f(q, resultExpected)
})
t.Run(`no drop_empty_series()`, func(t *testing.T) {
t.Parallel()
q := `sort((
(
alias(time(), "foo"),
alias(500 + time(), "bar"),
) > 2000
) default 123)`
r1 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{123, 123, 123, 123, 123, 123},
Timestamps: timestampsExpected,
}
r1.MetricName.MetricGroup = []byte("foo")
r2 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{123, 123, 123, 2100, 2300, 2500},
Timestamps: timestampsExpected,
}
r2.MetricName.MetricGroup = []byte("bar")
resultExpected := []netstorage.Result{r1, r2}
f(q, resultExpected)
})
t.Run(`drop_common_labels(single_series)`, func(t *testing.T) {
t.Parallel()
q := `drop_common_labels(label_set(time(), "foo", "bar", "__name__", "xxx", "q", "we"))`
@ -8960,6 +9044,9 @@ func TestExecError(t *testing.T) {
f(`delta_prometheus()`)
f(`rollup_candlestick()`)
f(`rollup()`)
f(`drop_empty_series()`)
f(`drop_common_labels()`)
f(`labels_equal()`)
// Invalid argument type
f(`median_over_time({}, 2)`)

View file

@ -45,6 +45,7 @@ var transformFuncs = map[string]transformFunc{
"days_in_month": newTransformFuncDateTime(transformDaysInMonth),
"deg": newTransformFuncOneArg(transformDeg),
"drop_common_labels": transformDropCommonLabels,
"drop_empty_series": transformDropEmptySeries,
"end": newTransformFuncZeroArgs(transformEnd),
"exp": newTransformFuncOneArg(transformExp),
"floor": newTransformFuncOneArg(transformFloor),
@ -74,6 +75,7 @@ var transformFuncs = map[string]transformFunc{
"label_uppercase": transformLabelUppercase,
"label_value": transformLabelValue,
"limit_offset": transformLimitOffset,
"labels_equal": transformLabelsEqual,
"ln": newTransformFuncOneArg(transformLn),
"log2": newTransformFuncOneArg(transformLog2),
"log10": newTransformFuncOneArg(transformLog10),
@ -420,7 +422,7 @@ func transformBucketsLimit(tfa *transformFuncArg) ([]*timeseries, error) {
mn.CopyFrom(&ts.MetricName)
mn.RemoveTag("le")
b = marshalMetricNameSorted(b[:0], &mn)
k := bytesutil.ToUnsafeString(b)
k := string(b)
m[k] = append(m[k], x{
le: le,
ts: ts,
@ -1840,6 +1842,15 @@ func transformDropCommonLabels(tfa *transformFuncArg) ([]*timeseries, error) {
return rvs, nil
}
func transformDropEmptySeries(tfa *transformFuncArg) ([]*timeseries, error) {
args := tfa.args
if err := expectTransformArgsNum(args, 1); err != nil {
return nil, err
}
rvs := removeEmptySeries(args[0])
return rvs, nil
}
func transformLabelCopy(tfa *transformFuncArg) ([]*timeseries, error) {
return transformLabelCopyExt(tfa, false)
}
@ -2013,6 +2024,43 @@ func labelReplace(tss []*timeseries, srcLabel string, r *regexp.Regexp, dstLabel
return tss, nil
}
func transformLabelsEqual(tfa *transformFuncArg) ([]*timeseries, error) {
args := tfa.args
if len(args) < 3 {
return nil, fmt.Errorf("unexpected number of args; got %d; want at least 3", len(args))
}
tss := args[0]
var labelNames []string
for i, ts := range args[1:] {
labelName, err := getString(ts, i+1)
if err != nil {
return nil, fmt.Errorf("cannot get label name: %w", err)
}
labelNames = append(labelNames, labelName)
}
rvs := tss[:0]
for _, ts := range tss {
if hasIdenticalLabelValues(&ts.MetricName, labelNames) {
rvs = append(rvs, ts)
}
}
return rvs, nil
}
func hasIdenticalLabelValues(mn *storage.MetricName, labelNames []string) bool {
if len(labelNames) < 2 {
return true
}
labelValue := mn.GetTagValue(labelNames[0])
for _, labelName := range labelNames[1:] {
b := mn.GetTagValue(labelName)
if string(labelValue) != string(b) {
return false
}
}
return true
}
func transformLabelValue(tfa *transformFuncArg) ([]*timeseries, error) {
args := tfa.args
if err := expectTransformArgsNum(args, 2); err != nil {

View file

@ -1,14 +1,13 @@
{
"files": {
"main.css": "./static/css/main.4ab6595e.css",
"main.js": "./static/js/main.dbf8fb4f.js",
"main.css": "./static/css/main.d9ac05de.css",
"main.js": "./static/js/main.70434a4f.js",
"static/js/522.b5ae4365.chunk.js": "./static/js/522.b5ae4365.chunk.js",
"static/media/Lato-Regular.ttf": "./static/media/Lato-Regular.d714fec1633b69a9c2e9.ttf",
"static/media/Lato-Bold.ttf": "./static/media/Lato-Bold.32360ba4b57802daa4d6.ttf",
"static/media/MetricsQL.md": "./static/media/MetricsQL.957b90ab4cb4852eec26.md",
"index.html": "./index.html"
},
"entrypoints": [
"static/css/main.4ab6595e.css",
"static/js/main.dbf8fb4f.js"
"static/css/main.d9ac05de.css",
"static/js/main.70434a4f.js"
]
}

View file

@ -1 +1 @@
<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.ico"/><meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=5"/><meta name="theme-color" content="#000000"/><meta name="description" content="UI for VictoriaMetrics"/><link rel="apple-touch-icon" href="./apple-touch-icon.png"/><link rel="icon" type="image/png" sizes="32x32" href="./favicon-32x32.png"><link rel="manifest" href="./manifest.json"/><title>VM UI</title><script src="./dashboards/index.js" type="module"></script><meta name="twitter:card" content="summary_large_image"><meta name="twitter:image" content="./preview.jpg"><meta name="twitter:title" content="UI for VictoriaMetrics"><meta name="twitter:description" content="Explore and troubleshoot your VictoriaMetrics data"><meta name="twitter:site" content="@VictoriaMetrics"><meta property="og:title" content="Metric explorer for VictoriaMetrics"><meta property="og:description" content="Explore and troubleshoot your VictoriaMetrics data"><meta property="og:image" content="./preview.jpg"><meta property="og:type" content="website"><script defer="defer" src="./static/js/main.dbf8fb4f.js"></script><link href="./static/css/main.4ab6595e.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.ico"/><meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=5"/><meta name="theme-color" content="#000000"/><meta name="description" content="UI for VictoriaMetrics"/><link rel="apple-touch-icon" href="./apple-touch-icon.png"/><link rel="icon" type="image/png" sizes="32x32" href="./favicon-32x32.png"><link rel="manifest" href="./manifest.json"/><title>VM UI</title><script src="./dashboards/index.js" type="module"></script><meta name="twitter:card" content="summary_large_image"><meta name="twitter:image" content="./preview.jpg"><meta name="twitter:title" content="UI for VictoriaMetrics"><meta name="twitter:description" content="Explore and troubleshoot your VictoriaMetrics data"><meta name="twitter:site" content="@VictoriaMetrics"><meta property="og:title" content="Metric explorer for VictoriaMetrics"><meta property="og:description" content="Explore and troubleshoot your VictoriaMetrics data"><meta property="og:image" content="./preview.jpg"><meta property="og:type" content="website"><script defer="defer" src="./static/js/main.70434a4f.js"></script><link href="./static/css/main.d9ac05de.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Some files were not shown because too many files have changed in this diff Show more