diff --git a/.gitignore b/.gitignore index b5246b398..0b323d148 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,10 @@ /victoria-metrics-data /vmstorage-data /vmselect-cache +.DS_Store + + +### terraform +terraform.tfstate +terraform.tfstate.* +.terraform/ diff --git a/Makefile b/Makefile index 9d258dfa6..d3a614cb0 100644 --- a/Makefile +++ b/Makefile @@ -11,16 +11,26 @@ endif GO_BUILDINFO = -X '$(PKG_PREFIX)/lib/buildinfo.Version=$(APP_NAME)-$(shell date -u +'%Y%m%d-%H%M%S')-$(BUILDINFO_TAG)' all: \ - victoria-metrics-prod + vminsert \ + vmselect \ + vmstorage include app/*/Makefile include deployment/*/Makefile +include deployment/*/helm/Makefile clean: rm -rf bin/* -release: victoria-metrics-prod - cd bin && tar czf victoria-metrics-$(PKG_TAG).tar.gz victoria-metrics-prod +publish: \ + publish-vmstorage \ + publish-vmselect \ + publish-vminsert + +package: \ + package-vmstorage \ + package-vmselect \ + package-vminsert fmt: go fmt $(PKG_PREFIX)/lib/... @@ -57,6 +67,9 @@ vendor-update: go mod tidy go mod vendor +app-local: + GO111MODULE=on go build $(RACE) -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/$(APP_NAME)$(RACE) $(PKG_PREFIX)/app/$(APP_NAME) + quicktemplate-gen: install-qtc qtc diff --git a/README.md b/README.md index 33a62a5f6..6738de51d 100644 --- a/README.md +++ b/README.md @@ -1,386 +1,170 @@ Victoria Metrics -## Single-node VictoriaMetrics +# Cluster version of VictoriaMetrics -[![Latest Release](https://img.shields.io/github/release/VictoriaMetrics/VictoriaMetrics.svg?style=flat-square)](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) +VictoriaMetrics is fast and cost-effective long-term remote storage for Prometheus. -VictoriaMetrics is a long-term remote storage for Prometheus. -It is available in [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases), -[docker images](https://hub.docker.com/r/valyala/victoria-metrics/) and -in [source code](https://github.com/VictoriaMetrics/VictoriaMetrics). - -Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster). +Single-node version is available [here](https://github.com/VictoriaMetrics/VictoriaMetrics). ## Prominent features -* Supports [Prometheus querying API](https://prometheus.io/docs/prometheus/latest/querying/api/), so it can be used as Prometheus drop-in replacement in Grafana. - Additionally, VictoriaMetrics extends PromQL with opt-in [useful features](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/ExtendedPromQL). -* High performance and good scalability for both [inserts](https://medium.com/@valyala/high-cardinality-tsdb-benchmarks-victoriametrics-vs-timescaledb-vs-influxdb-13e6ee64dd6b) - and [selects](https://medium.com/@valyala/when-size-matters-benchmarking-victoriametrics-vs-timescale-and-influxdb-6035811952d4). - [Outperforms InfluxDB and TimescaleDB by up to 20x](https://medium.com/@valyala/measuring-vertical-scalability-for-time-series-databases-in-google-cloud-92550d78d8ae). -* [Uses 10x less RAM than InfluxDB](https://medium.com/@valyala/insert-benchmarks-with-inch-influxdb-vs-victoriametrics-e31a41ae2893) when working with millions of unique time series (aka high cardinality). -* High data compression, so [up to 70x more data points](https://medium.com/@valyala/when-size-matters-benchmarking-victoriametrics-vs-timescale-and-influxdb-6035811952d4) - may be crammed into a limited storage comparing to TimescaleDB. 
-* Optimized for storage with high-latency IO and low iops (HDD and network storage in AWS, Google Cloud, Microsoft Azure, etc). See [graphs from these benchmarks](https://medium.com/@valyala/high-cardinality-tsdb-benchmarks-victoriametrics-vs-timescaledb-vs-influxdb-13e6ee64dd6b). -* A single-node VictoriaMetrics may substitute moderately sized clusters built with competing solutions such as Thanos, Uber M3, Cortex, InfluxDB or TimescaleDB. - See [vertical scalability benchmarks](https://medium.com/@valyala/measuring-vertical-scalability-for-time-series-databases-in-google-cloud-92550d78d8ae). -* Easy operation: - * VictoriaMetrics consists of a single executable without external dependencies. - * All the configuration is done via explicit command-line flags with reasonable defaults. - * All the data is stored in a single directory pointed by `-storageDataPath` flag. - * Easy backups from [instant snapshots](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282). -* Storage is protected from corruption on unclean shutdown (i.e. hardware reset or `kill -9`) thanks to [the storage architecture](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282). -* Supports metrics' ingestion and backfilling via the following protocols: - * [Prometheus remote write API](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write) - * [InfluxDB line protocol](https://docs.influxdata.com/influxdb/v1.7/write_protocols/line_protocol_tutorial/) - * [Graphite plaintext protocol](https://graphite.readthedocs.io/en/latest/feeding-carbon.html) with [tags](https://graphite.readthedocs.io/en/latest/tags.html#carbon) - if `-graphiteListenAddr` is set. - * [OpenTSDB put message](http://opentsdb.net/docs/build/html/api_telnet/put.html) if `-opentsdbListenAddr` is set. -* Ideally works with big amounts of time series data from IoT sensors, connected car sensors and industrial sensors. -* Has open source [cluster version](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster). +- Supports all the features of [single-node version](https://github.com/VictoriaMetrics/VictoriaMetrics). +- Scales horizontally to multiple nodes. +- Supports multiple independent namespaces for time series data (aka multi-tenancy). -## Operation +## Architecture overview + +VictoriaMetrics cluster consists of the following services: + +- `vmstorage` - stores the data +- `vminsert` - proxies the ingested data to `vmstorage` +- `vmselect` - performs incoming queries using the data from `vmstorage` + +Each service may scale independently and may run on the most suitable hardware. 
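+
+For illustration, a minimal single-host cluster might be started as follows. The data path, retention and tenant `0` are placeholders; the ports are the defaults described in the "Cluster setup" and "URL format" sections below:
+
+```
+# Storage layer.
+./bin/vmstorage -storageDataPath=/var/lib/vmstorage -retentionPeriod=3
+
+# Ingestion and query layers, pointed at the storage node.
+./bin/vminsert -storageNode=localhost:8400
+./bin/vmselect -storageNode=localhost:8401
+
+# Prometheus can then remote_write to http://localhost:8480/insert/0/prometheus ,
+# while Grafana or any Prometheus API client can query
+# http://localhost:8481/select/0/prometheus .
+```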
-### Table of contents +## Building from sources -* [How to build from sources](#how-to-build-from-sources) -* [How to start VictoriaMetrics](#how-to-start-victoriametrics) -* [Prometheus setup](#prometheus-setup) -* [Grafana setup](#grafana-setup) -* [How to send data from InfluxDB-compatible agents such as Telegraf](#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf) -* [How to send data from Graphite-compatible agents such as StatsD](#how-to-send-data-from-graphite-compatible-agents-such-as-statsd) -* [How to send data from OpenTSDB-compatible agents](#how-to-send-data-from-opentsdb-compatible-agents) -* [How to apply new config / ugrade VictoriaMetrics](#how-to-apply-new-config--upgrade-victoriametrics) -* [How to work with snapshots](#how-to-work-with-snapshots) -* [How to delete time series](#how-to-delete-time-series) -* [How to export time series](#how-to-export-time-series) -* [Federation](#federation) -* [Capacity planning](#capacity-planning) -* [High Availability](#high-availability) -* [Multiple retentions](#multiple-retentions) -* [Scalability and cluster version](#scalability-and-cluster-version) -* [Security](#security) -* [Tuning](#tuning) -* [Monitoring](#monitoring) -* [Troubleshooting](#troubleshooting) -* [Community and contributions](#community-and-contributions) -* [Reporting bugs](#reporting-bugs) +### Development Builds + +1. [Install go](https://golang.org/doc/install). The minimum supported version is Go 1.12. +2. Run `make` from the repository root. It should build `vmstorage`, `vmselect` + and `vminsert` binaries and put them into the `bin` folder. -### How to build from sources +### Production builds -We recommend using either [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) or -[docker images](https://hub.docker.com/r/valyala/victoria-metrics/) instead of building VictoriaMetrics -from sources. Building from sources is reasonable when developing an additional features specific -to your needs. +There is no need in installing Go on a host system since binaries are built +inside [the official docker container for Go](https://hub.docker.com/_/golang). +This makes reproducible builds. +So [install docker](https://docs.docker.com/install/) and run the following command: +``` +make vminsert-prod vmselect-prod vmstorage-prod +``` -#### Development build +Production binaries are built into statically linked binaries for `GOARCH=amd64`, `GOOS=linux`. +They are put into `bin` folder with `-prod` suffixes: +``` +$ make vminsert-prod vmselect-prod vmstorage-prod +$ ls -1 bin +vminsert-prod +vmselect-prod +vmstorage-prod +``` -1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.12. -2. Run `go build ./app/victoria-metrics` from the root folder of the repository. - It will build `victoria-metrics` binary in the root folder of the repository. +### Building docker images -#### Production build +Run `make package`. It will build the following docker images locally: -1. [Install docker](https://docs.docker.com/install/). -2. Run `make victoria-metrics-prod` from the root folder of the respository. - It will build `victoria-metrics-prod` binary and put it into the `bin` folder. +* `valyala/vminsert:` +* `valyala/vmselect:` +* `valyala/vmstorage:` -#### Building docker images - -Run `make package-victoria-metrics`. It will build `valyala/victoria-metrics:` docker image locally. `` is auto-generated image tag, which depends on source code in the repository. 
The `` may be manually set via `PKG_TAG=foobar make package`. -### How to start VictoriaMetrics +## Operation -Just start VictoriaMetrics executable or docker image with the desired command-line flags. +### Cluster setup -The following command line flags are used the most: +A minimal cluster must contain the following nodes: -* `-storageDataPath` - path to data directory. VictoriaMetrics stores all the data in this directory. -* `-retentionPeriod` - retention period in months for the data. Older data is automatically deleted. -* `-httpListenAddr` - TCP address to listen to for http requests. By default it listens port `8428` on all the network interfaces. -* `-graphiteListenAddr` - TCP and UDP address to listen to for Graphite data. By default it is disabled. -* `-opentsdbListenAddr` - TCP and UDP address to listen to for OpenTSDB data. By default it is disabled. +* a single `vmstorage` node with `-retentionPeriod` and `-storageDataPath` flags +* a single `vminsert` node with `-storageNode=:8400` +* a single `vmselect` node with `-storageNode=:8401` -Pass `-help` to see all the available flags with description and default values. +It is recommended to run at least two nodes for each service +for high availability purposes. +An http load balancer must be put in front of `vminsert` and `vmselect` nodes: +- requests starting with `/insert` must be routed to port `8480` on `vminsert` nodes. +- requests starting with `/select` must be routed to port `8481` on `vmselect` nodes. -### Prometheus setup +Ports may be altered by setting `-httpListenAddr` on the corresponding nodes. -Add the following lines to Prometheus config file (it is usually located at `/etc/prometheus/prometheus.yml`): -```yml -remote_write: - - url: http://:8428/api/v1/write - queue_config: - max_samples_per_send: 10000 -``` +### URL format -Substitute `` with the hostname or IP address of VictoriaMetrics. -Then apply the new config via the following command: +* URLs for data ingestion: `/insert//`, where: + - `` is an arbitrary number identifying namespace for data ingestion + - `` may have the following values: + - `prometheus` - for inserting data with [Prometheus remote write API](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write) + - `influx/write` or `influx/api/v2/write` - for inserting data with [Influx line protocol](https://docs.influxdata.com/influxdb/v1.7/write_protocols/line_protocol_tutorial/) -``` -kill -HUP `pidof prometheus` -``` +* URLs for querying: `/select//prometheus/`, where: + - `` is an arbitrary number identifying data namespace for the query + - `` may have the following values: + - `api/v1/query` - performs [PromQL instant query](https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries) + - `api/v1/query_range` - performs [PromQL range query](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries) + - `api/v1/series` - performs [series query](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers) + - `api/v1/labels` - returns a [list of label names](https://prometheus.io/docs/prometheus/latest/querying/api/#getting-label-names) + - `api/v1/label//values` - returns values for the given `` according [to API](https://prometheus.io/docs/prometheus/latest/querying/api/#querying-label-values) + - `federate` - returns [federated metrics](https://prometheus.io/docs/prometheus/latest/federation/) + - `api/v1/export` - exports raw data. 
See [this article](https://medium.com/@valyala/analyzing-prometheus-data-with-external-tools-5f3e5e147639) for details -Prometheus writes incoming data to local storage and to remote storage in parallel. -This means the data remains available in local storage for `--storage.tsdb.retention.time` duration -if remote storage stops working. +* `vmstorage` nodes provide the following HTTP endpoints on `8482` port: + - `/snapshot/create` - create [instant snapshot](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282), + which can be used for backups in background. Snapshots are created in `/snapshots` folder, where `` is the corresponding + command-line flag value. + - `/snapshot/list` - list available snasphots. + - `/snapshot/delete?snapshot=` - delete the given snapshot. + - `/snapshot/delete_all` - delete all the snapshots. -If you plan sending data to VictoriaMetrics from multiple Prometheus instances, then add the following lines into `global` section -of [Prometheus config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#configuration-file): + Snapshots may be created independently on each `vmstorage` node. There is no need in synchronizing snapshots' creation + across `vmstorage` nodes. -```yml -global: - external_labels: - datacenter: dc-123 -``` -This instructs Prometheus to add `datacenter=dc-123` label to each time series sent to remote storage. -The label name may be arbitrary - `datacenter` is just an example. The label value must be unique -across Prometheus instances, so time series may be filtered and grouped by this label. +### Cluster resizing +* `vminsert` and `vmselect` nodes are stateless and may be added / removed at any time. + Do not forget updating the list of these nodes on http load balancer. +* `vmstorage` nodes own the ingested data, so they cannot be removed without data loss. -### Grafana setup +Steps to add `vmstorage` node: -Create [Prometheus datasource](http://docs.grafana.org/features/datasources/prometheus/) in Grafana with the following Url: +1. Start new `vmstorage` node. +2. Gradually restart all the `vmselect` nodes with new `-storageNode` arg containing `:8401`. +3. Gradually restart all the `vminsert` nodes with new `-storageNode` arg containing `:8400`. -``` -http://:8428 -``` -Substitute `` with the hostname or IP address of VictoriaMetrics. +### Cluster availability -Then build graphs with the created datasource using [Prometheus query language](https://prometheus.io/docs/prometheus/latest/querying/basics/). -VictoriaMetrics supports native PromQL and [extends it with useful features](ExtendedPromQL). +* HTTP load balancer must stop routing requests to unavailable `vminsert` and `vmselect` nodes. +* The cluster remains available if at least a single `vmstorage` node exists: + - `vminsert` re-routes incoming data from unavailable `vmstorage` nodes to healthy `vmstorage` nodes + - `vmselect` continues serving partial responses if at least a single `vmstorage` node is available. -### How to send data from InfluxDB-compatible agents such as [Telegraf](https://www.influxdata.com/time-series-platform/telegraf/)? -Just use `http://:8428` url instead of InfluxDB url in agents' configs. 
-For instance, put the following lines into `Telegraf` config, so it sends data to VictoriaMetrics instead of InfluxDB: +### Updating / reconfiguring cluster nodes -``` -[[outputs.influxdb]] - urls = ["http://:8428"] -``` +All the node types - `vminsert`, `vmselect` and `vmstorage` - may be updated via graceful shutdown. +Send `SIGINT` signal to the corresponding process, wait until it finishes and then start new version +with new configs. -Do not forget substituting `` with the real address where VictoriaMetrics runs. +Cluster should remain in working state if at least a single node of each type remains available during +the update process. See [cluster availability](cluster-availability) section for details. -VictoriaMetrics maps Influx data using the following rules: -* [`db` query arg](https://docs.influxdata.com/influxdb/v1.7/tools/api/#write-http-endpoint) is mapped into `db` label value -* Field names are mapped to time series names prefixed by `{measurement}.` value -* Field values are mapped to time series values -* Tags are mapped to Prometheus labels as-is +### Helm -### How to send data from Graphite-compatible agents such as [StatsD](https://github.com/etsy/statsd)? +* Helm chart is available in the `deployment/k8s/helm/victoria-metrics` folder. -1) Enable Graphite receiver in VictoriaMetrics by setting `-graphiteListenAddr` command line flag. For instance, -the following command will enable Graphite receiver in VictoriaMetrics on TCP and UDP port `2003`: +1. Install Cluster: `helm install -n deployment/k8s/helm/victoria-mertrics` or `ENV= make helm-install`. +2. Upgrade Cluster: `helm upgrade deployment/k8s/helm/victoria-mertrics` or `ENV= make helm-upgrade`. +3. Delete Cluster: `helm del --purge ` or `ENV= make helm-delete`. -``` -/path/to/victoria-metrics-prod ... -graphiteListenAddr=:2003 -``` +* Upgrade follows `Cluster resizing procedure` under the hood. -2) Use the configured address in Graphite-compatible agents. For instance, set `graphiteHost` -to the VictoriaMetrics host in `StatsD` configs. - - -### How to send data from OpenTSDB-compatible agents? - -1) Enable OpenTSDB receiver in VictoriaMetrics by setting `-opentsdbListenAddr` command line flag. For instance, -the following command will enable OpenTSDB receiver in VictoriaMetrics on TCP and UDP port `4242`: - -``` -/path/to/victoria-metrics-prod ... -opentsdbListenAddr=:4242 -``` - -2) Send data to the given address from OpenTSDB-compatible agents. - - -### How to apply new config / upgrade VictoriaMetrics? - -VictoriaMetrics must be restarted in order to upgrade or apply new config: - -1) Send `SIGINT` signal to VictoriaMetrics process in order to gracefully stop it. -2) Wait until the process stops. This can take a few seconds. -3) Start the upgraded VictoriaMetrics with new config. - - -### How to work with snapshots? - -Navigate to `http://:8428/snapshot/create` in order to create an instant snapshot. -The page will return the following JSON response: - -``` -{"status":"ok","snapshot":""} -``` - -Snapshots are created under `<-storageDataPath>/snapshots` directory, where `<-storageDataPath>` -is the command-line flag value. Snapshots can be archived to backup storage via `rsync -L`, `scp -r` -or any similar tool that follows symlinks during copying. - -The `http://:8428/snapshot/list` page contains the list of available snapshots. - -Navigate to `http://:8428/snapshot/delete?snapshot=` in order -to delete `` snapshot. - -Navigate to `http://:8428/snapshot/delete_all` in order to delete all the snapshots. 
- - -### How to delete time series? - -Send a request to `http://:8428/api/v1/admin/tsdb/delete_series?match[]=`, -where `` may contain any [time series selector](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors) -for metrics to delete. After that all the time series matching the given selector are deleted. Storage space for -the deleted time series isn't freed instantly - it is freed during subsequent merges of data files. - - -### How to export time series? - -Send a request to `http://:8428/api/v1/export?match[]=`, -where `` may contain any [time series selector](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors) -for metrics to export. The response would contain all the data for the selected time series in [JSON streaming format](https://en.wikipedia.org/wiki/JSON_streaming#Line-delimited_JSON). -Each JSON line would contain data for a single time series. An example output: - -``` -{"metric":{"__name__":"up","job":"node_exporter","instance":"localhost:9100"},"values":[0,0,0],"timestamps":[1549891472010,1549891487724,1549891503438]} -{"metric":{"__name__":"up","job":"prometheus","instance":"localhost:9090"},"values":[1,1,1],"timestamps":[1549891461511,1549891476511,1549891491511]} -``` - -Optional `start` and `end` args may be added to the request in order to limit the time frame for the exported data. These args may contain either -unix timestamp in seconds or [RFC3339](https://www.ietf.org/rfc/rfc3339.txt) values. - - -### Federation - -VictoriaMetrics exports [Prometheus-compatible federation data](https://prometheus.io/docs/prometheus/latest/federation/) -at `http://:8428/federate?match[]=`. - -Optional `start` and `end` args may be added to the request in order to scrape the last point for each selected time series on the `[start ... end]` interval. -`start` and `end` may contain either unix timestamp in seconds or [RFC3339](https://www.ietf.org/rfc/rfc3339.txt) values. By default the last point -on the interval `[now - max_lookback ... now]` is scraped for each time series. Default value for `max_lookback` is `5m` (5 minutes), but can be overriden. -For instance, `/federate?match[]=up&max_lookback=1h` would return last points on the `[now - 1h ... now]` interval. This may be useful for time series federation -with scrape intervals exceeding `5m`. - - -### Capacity planning - -Rough estimation of the required resources: - -* RAM size: less than 1KB per active time series. So, ~1GB of RAM is required for 1M active time series. - Time series is considered active if new data points have been added to it recently or if it has been recently queried. - VictoriaMetrics stores various caches in RAM. Memory size for these caches may be limited with `-memory.allowedPercent` flag. -* CPU cores: a CPU core per 300K inserted data points per second. So, ~4 CPU cores are required for processing - the insert stream of 1M data points per second. - If you see lower numbers per CPU core, then it is likely active time series info doesn't fit caches, - so you need more RAM for lowering CPU usage. -* Storage size: less than a byte per data point on average. So, ~260GB is required for storing a month-long insert stream - of 100K data points per second. - The actual storage size heavily depends on data randomness (entropy). Higher randomness means higher storage size requirements. - - -### High availability - -1) Install multiple VictoriaMetrics instances in distinct datacenters. 
-2) Add addresses of these instances to `remote_write` section in Prometheus config: - -```yml -remote_write: - - url: http://:8428/api/v1/write - queue_config: - max_samples_per_send: 10000 - # ... - - url: http://:8428/api/v1/write - queue_config: - max_samples_per_send: 10000 -``` - -3) Apply the updated config: - -``` -kill -HUP `pidof prometheus` -``` - -4) Now Prometheus should write data into all the configured `remote_write` urls in parallel. -5) Set up [Promxy](https://github.com/jacksontj/promxy) in front of all the VictoriaMetrics replicas. -6) Set up Prometheus datasource in Grafana that points to Promxy. - - -### Multiple retentions - -Just start multiple VictoriaMetrics instances with distinct values for the following flags: - -* `-retentionPeriod` -* `-storageDataPath`, so the data for each retention period is saved in a separate directory -* `-httpListenAddr`, so clients may reach VictoriaMetrics instance with proper retention - - -### Scalability and cluster version - -Though single-node VictoriaMetrics cannot scale to multiple nodes, it is optimized for resource usage - storage size / bandwidth / IOPS, RAM, CPU. -This means that a single-node VictoriaMetrics may scale vertically and substitute moderately sized cluster built with competing solutions -such as Thanos, Uber M3, InfluxDB or TimescaleDB. - -So try single-node VictoriaMetrics at first and then [switch to cluster version](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster) if you still need -horizontally scalable long-term remote storage for really large Prometheus deployments. -[Contact us](mailto:info@victoriametrics.com) for paid support. - - -### Security - -Do not forget protecting sensitive endpoints in VictoriaMetrics when exposing it to untrusted networks such as internet. -Consider setting the following command-line flags: - -* `-tls`, `-tlsCertFile` and `-tlsKeyFile` for switching from HTTP to HTTPS. -* `-httpAuth.username` and `-httpAuth.password` for protecting all the HTTP endpoints - with [HTTP Basic Authentication](https://en.wikipedia.org/wiki/Basic_access_authentication). -* `-deleteAuthKey` for protecting `/api/v1/admin/tsdb/delete_series` endpoint. See [how to delete time series](#how-to-delete-time-series). -* `-snapshotAuthKey` for protecting `/snapshot*` endpoints. See [how to work with snapshots](#how-to-work-with-snapshots). - -Explicitly set internal network interface for TCP and UDP ports for data ingestion with Graphite and OpenTSDB formats. -For example, substitute `-graphiteListenAddr=:2003` with `-graphiteListenAddr=:2003`. - - -### Tuning - -* There is no need in VictoriaMetrics tuning, since it uses reasonable defaults for command-line flags, - which are automatically adjusted for the available CPU and RAM resources. -* There is no need in Operating System tuning, since VictoriaMetrics is optimized for default OS settings. - The only option is increasing the limit on [the number open files in the OS](https://medium.com/@muhammadtriwibowo/set-permanently-ulimit-n-open-files-in-ubuntu-4d61064429a), - so Prometheus instances could establish more connections to VictoriaMetrics. - - -### Monitoring - -VictoriaMetrics exports internal metrics in Prometheus format on the `/metrics` page. -Add this page to Prometheus' scrape config in order to collect VictoriaMetrics metrics. -There is [an official Grafana dashboard for single-node VictoriaMetrics](https://grafana.com/dashboards/10229). 
- - -### Troubleshooting - -* If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second, - then it is likely you have too many active time series for the current amount of RAM. - It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve - ingestion performance. - Another option is to increase `-memory.allowedPercent` command-line flag value. Be careful with this - option, since too big value for `-memory.allowedPercent` may result in high I/O usage. ## Community and contributions -Feel free asking any questions regarding VictoriaMetrics [here](https://groups.google.com/forum/#!forum/victorametrics-users). - We are open to third-party pull requests provided they follow [KISS design principle](https://en.wikipedia.org/wiki/KISS_principle): - Prefer simple code and architecture. @@ -392,6 +176,17 @@ We are open to third-party pull requests provided they follow [KISS design princ Adhering `KISS` principle simplifies the resulting code and architecture, so it can be reviewed, understood and verified by many people. +Due to `KISS` cluster version of VictoriaMetrics has no the following "features" popular in distributed computing world: + +- Fragile [gossip protocols](https://github.com/improbable-eng/thanos/blob/master/docs/proposals/approved/201809_gossip-removal.md). +- Hard-to-understand-and-implement-properly [Paxos protocols](https://www.quora.com/In-distributed-systems-what-is-a-simple-explanation-of-the-Paxos-algorithm). +- Complex replication schemes, which may go nuts in unforesseen edge cases. The replication is offloaded to the underlying durable replicated storage + such as [persistent disks in Google Compute Engine](https://cloud.google.com/compute/docs/disks/#pdspecs). +- Automatic data reshuffling between storage nodes, which may hurt cluster performance and availability. +- Automatic cluster resizing, which may cost you a lot of money if improperly configured. +- Automatic discovering and addition of new nodes in the cluster, which may mix data between dev and prod clusters :) +- Automatic leader election, which may result in split brain disaster on network errors. + ## Reporting bugs diff --git a/app/victoria-metrics/Makefile b/app/victoria-metrics/Makefile deleted file mode 100644 index 6077a7d83..000000000 --- a/app/victoria-metrics/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -# All these commands must run from repository root. 
- -victoria-metrics-prod: - APP_NAME=victoria-metrics $(MAKE) app-via-docker - -package-victoria-metrics: - APP_NAME=victoria-metrics \ - $(MAKE) package-via-docker - -publish-victoria-metrics: - APP_NAME=victoria-metrics $(MAKE) publish-via-docker - -run-victoria-metrics: - mkdir -p victoria-metrics-data - DOCKER_OPTS='-v $(shell pwd)/victoria-metrics-data:/victoria-metrics-data -p 8428:8428 -p 2003:2003 -p 2003:2003/udp' \ - APP_NAME=victoria-metrics \ - ARGS='-graphiteListenAddr=:2003 -opentsdbListenAddr=:4242 -retentionPeriod=12 -search.maxUniqueTimeseries=1000000 -search.maxQueryDuration=10m' \ - $(MAKE) run-via-docker - -victoria-metrics-arm: - CC=arm-linux-gnueabi-gcc CGO_ENABLED=1 GOARCH=arm GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/victoria-metrics-arm ./app/victoria-metrics diff --git a/app/victoria-metrics/main.go b/app/victoria-metrics/main.go deleted file mode 100644 index 0dead5cc0..000000000 --- a/app/victoria-metrics/main.go +++ /dev/null @@ -1,60 +0,0 @@ -package main - -import ( - "flag" - "net/http" - "time" - - "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert" - "github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect" - "github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage" - "github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo" - "github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver" - "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" - "github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil" -) - -var httpListenAddr = flag.String("httpListenAddr", ":8428", "TCP address to listen for http connections") - -func main() { - flag.Parse() - buildinfo.Init() - logger.Init() - logger.Infof("starting VictoraMetrics at %q...", *httpListenAddr) - startTime := time.Now() - vmstorage.Init() - vmselect.Init() - vminsert.Init() - - go httpserver.Serve(*httpListenAddr, requestHandler) - logger.Infof("started VictoriaMetrics in %s", time.Since(startTime)) - - sig := procutil.WaitForSigterm() - logger.Infof("received signal %s", sig) - - logger.Infof("gracefully shutting down webservice at %q", *httpListenAddr) - startTime = time.Now() - if err := httpserver.Stop(*httpListenAddr); err != nil { - logger.Fatalf("cannot stop the webservice: %s", err) - } - vminsert.Stop() - logger.Infof("successfully shut down the webservice in %s", time.Since(startTime)) - - vmstorage.Stop() - vmselect.Stop() - - logger.Infof("the VictoriaMetrics has been stopped in %s", time.Since(startTime)) -} - -func requestHandler(w http.ResponseWriter, r *http.Request) bool { - if vminsert.RequestHandler(w, r) { - return true - } - if vmselect.RequestHandler(w, r) { - return true - } - if vmstorage.RequestHandler(w, r) { - return true - } - return false -} diff --git a/app/vminsert/Makefile b/app/vminsert/Makefile new file mode 100644 index 000000000..1a798b56f --- /dev/null +++ b/app/vminsert/Makefile @@ -0,0 +1,31 @@ +# All these commands must run from repository root. 
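+#
+# For example (assuming the shared app-local / *-via-docker targets defined in the
+# top-level Makefile and in the deployment Makefiles):
+#   make vminsert          - local build into bin/vminsert
+#   make vminsert-prod     - statically linked production build via the Go docker image
+#   make package-vminsert  - local docker image for vminsert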
+ +run-vminsert: + DOCKER_OPTS='-p 8480:8480' \ + APP_NAME=vminsert \ + ARGS='-storageNode=localhost:8400' \ + $(MAKE) run-via-docker + +vminsert: + APP_NAME=vminsert $(MAKE) app-local + +vminsert-race: + APP_NAME=vminsert RACE=-race $(MAKE) app-local + +vminsert-prod: + APP_NAME=vminsert $(MAKE) app-via-docker + +vminsert-prod-race: + APP_NAME=vminsert RACE=-race $(MAKE) app-via-docker + +package-vminsert: + APP_NAME=vminsert $(MAKE) package-via-docker + +package-vminsert-race: + APP_NAME=vminsert RACE=-race $(MAKE) package-via-docker + +publish-vminsert: + APP_NAME=vminsert $(MAKE) publish-via-docker + +publish-vminsert-race: + APP_NAME=vminsert RACE=-race $(MAKE) publish-via-docker diff --git a/app/vminsert/README.md b/app/vminsert/README.md index 050290fa6..02705f5b7 100644 --- a/app/vminsert/README.md +++ b/app/vminsert/README.md @@ -1 +1 @@ -`vminsert` routes the ingested data to `vmstorage`. +`vminsert` routes the ingested data to `vmstorage` nodes. diff --git a/app/vminsert/common/insert_ctx.go b/app/vminsert/common/insert_ctx.go deleted file mode 100644 index 65fff3ec1..000000000 --- a/app/vminsert/common/insert_ctx.go +++ /dev/null @@ -1,106 +0,0 @@ -package common - -import ( - "fmt" - - "github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage" - "github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil" - "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb" - "github.com/VictoriaMetrics/VictoriaMetrics/lib/storage" -) - -// InsertCtx contains common bits for data points insertion. -type InsertCtx struct { - Labels []prompb.Label - - mrs []storage.MetricRow - metricNamesBuf []byte -} - -// Reset resets ctx for future fill with rowsLen rows. -func (ctx *InsertCtx) Reset(rowsLen int) { - for _, label := range ctx.Labels { - label.Name = nil - label.Value = nil - } - ctx.Labels = ctx.Labels[:0] - - for i := range ctx.mrs { - mr := &ctx.mrs[i] - mr.MetricNameRaw = nil - } - ctx.mrs = ctx.mrs[:0] - - if n := rowsLen - cap(ctx.mrs); n > 0 { - ctx.mrs = append(ctx.mrs[:cap(ctx.mrs)], make([]storage.MetricRow, n)...) - } - ctx.mrs = ctx.mrs[:rowsLen] - ctx.metricNamesBuf = ctx.metricNamesBuf[:0] -} - -func (ctx *InsertCtx) marshalMetricNameRaw(prefix []byte, labels []prompb.Label) []byte { - start := len(ctx.metricNamesBuf) - ctx.metricNamesBuf = append(ctx.metricNamesBuf, prefix...) - ctx.metricNamesBuf = storage.MarshalMetricNameRaw(ctx.metricNamesBuf, labels) - metricNameRaw := ctx.metricNamesBuf[start:] - return metricNameRaw[:len(metricNameRaw):len(metricNameRaw)] -} - -// WriteDataPoint writes (timestamp, value) with the given prefix and lables into ctx buffer. -func (ctx *InsertCtx) WriteDataPoint(prefix []byte, labels []prompb.Label, timestamp int64, value float64) { - metricNameRaw := ctx.marshalMetricNameRaw(prefix, labels) - ctx.addRow(metricNameRaw, timestamp, value) -} - -// WriteDataPointExt writes (timestamp, value) with the given metricNameRaw and labels into ctx buffer. -// -// It returns metricNameRaw for the given labels if len(metricNameRaw) == 0. 
-func (ctx *InsertCtx) WriteDataPointExt(metricNameRaw []byte, labels []prompb.Label, timestamp int64, value float64) []byte { - if len(metricNameRaw) == 0 { - metricNameRaw = ctx.marshalMetricNameRaw(nil, labels) - } - ctx.addRow(metricNameRaw, timestamp, value) - return metricNameRaw -} - -func (ctx *InsertCtx) addRow(metricNameRaw []byte, timestamp int64, value float64) { - mrs := ctx.mrs - if cap(mrs) > len(mrs) { - mrs = mrs[:len(mrs)+1] - } else { - mrs = append(mrs, storage.MetricRow{}) - } - mr := &mrs[len(mrs)-1] - ctx.mrs = mrs - mr.MetricNameRaw = metricNameRaw - mr.Timestamp = timestamp - mr.Value = value -} - -// AddLabel adds (name, value) label to ctx.Labels. -// -// name and value must exist until ctx.Labels is used. -func (ctx *InsertCtx) AddLabel(name, value string) { - labels := ctx.Labels - if cap(labels) > len(labels) { - labels = labels[:len(labels)+1] - } else { - labels = append(labels, prompb.Label{}) - } - label := &labels[len(labels)-1] - - // Do not copy name and value contents for performance reasons. - // This reduces GC overhead on the number of objects and allocations. - label.Name = bytesutil.ToUnsafeBytes(name) - label.Value = bytesutil.ToUnsafeBytes(value) - - ctx.Labels = labels -} - -// FlushBufs flushes buffered rows to the underlying storage. -func (ctx *InsertCtx) FlushBufs() error { - if err := vmstorage.AddRows(ctx.mrs); err != nil { - return fmt.Errorf("cannot store metrics: %s", err) - } - return nil -} diff --git a/app/victoria-metrics/deployment/Dockerfile b/app/vminsert/deployment/Dockerfile similarity index 57% rename from app/victoria-metrics/deployment/Dockerfile rename to app/vminsert/deployment/Dockerfile index f47a803a6..8d2b114b0 100644 --- a/app/victoria-metrics/deployment/Dockerfile +++ b/app/vminsert/deployment/Dockerfile @@ -1,5 +1,5 @@ FROM scratch COPY --from=local/certs:1.0.2 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt -COPY bin/victoria-metrics-prod . -EXPOSE 8428 -ENTRYPOINT ["/victoria-metrics-prod"] +COPY bin/vminsert-prod . +EXPOSE 8480 +ENTRYPOINT ["/vminsert-prod"] diff --git a/app/vminsert/graphite/request_handler.go b/app/vminsert/graphite/request_handler.go index 7c59d7cb2..0d82356a9 100644 --- a/app/vminsert/graphite/request_handler.go +++ b/app/vminsert/graphite/request_handler.go @@ -9,8 +9,9 @@ import ( "sync" "time" - "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/common" "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/concurrencylimiter" + "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/netstorage" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/auth" "github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil" "github.com/VictoriaMetrics/metrics" ) @@ -20,27 +21,27 @@ var rowsInserted = metrics.NewCounter(`vm_rows_inserted_total{type="graphite"}`) // insertHandler processes remote write for graphite plaintext protocol. 
// // See https://graphite.readthedocs.io/en/latest/feeding-carbon.html#the-plaintext-protocol -func insertHandler(r io.Reader) error { +func insertHandler(at *auth.Token, r io.Reader) error { return concurrencylimiter.Do(func() error { - return insertHandlerInternal(r) + return insertHandlerInternal(at, r) }) } -func insertHandlerInternal(r io.Reader) error { +func insertHandlerInternal(at *auth.Token, r io.Reader) error { ctx := getPushCtx() defer putPushCtx(ctx) for ctx.Read(r) { - if err := ctx.InsertRows(); err != nil { + if err := ctx.InsertRows(at); err != nil { return err } } return ctx.Error() } -func (ctx *pushCtx) InsertRows() error { +func (ctx *pushCtx) InsertRows(at *auth.Token) error { rows := ctx.Rows.Rows ic := &ctx.Common - ic.Reset(len(rows)) + ic.Reset() for i := range rows { r := &rows[i] ic.Labels = ic.Labels[:0] @@ -49,7 +50,9 @@ func (ctx *pushCtx) InsertRows() error { tag := &r.Tags[j] ic.AddLabel(tag.Key, tag.Value) } - ic.WriteDataPoint(nil, ic.Labels, r.Timestamp, r.Value) + if err := ic.WriteDataPoint(at, ic.Labels, r.Timestamp, r.Value); err != nil { + return err + } } rowsInserted.Add(len(rows)) return ic.FlushBufs() @@ -110,7 +113,7 @@ func (ctx *pushCtx) Read(r io.Reader) bool { type pushCtx struct { Rows Rows - Common common.InsertCtx + Common netstorage.InsertCtx reqBuf bytesutil.ByteBuffer tailBuf []byte @@ -128,7 +131,7 @@ func (ctx *pushCtx) Error() error { func (ctx *pushCtx) reset() { ctx.Rows.Reset() - ctx.Common.Reset(0) + ctx.Common.Reset() ctx.reqBuf.Reset() ctx.tailBuf = ctx.tailBuf[:0] diff --git a/app/vminsert/graphite/server.go b/app/vminsert/graphite/server.go index 75879853b..38abb5afe 100644 --- a/app/vminsert/graphite/server.go +++ b/app/vminsert/graphite/server.go @@ -7,6 +7,7 @@ import ( "sync" "time" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/auth" "github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/metrics" @@ -70,7 +71,8 @@ func serveTCP(ln net.Listener) { } go func() { writeRequestsTCP.Inc() - if err := insertHandler(c); err != nil { + var at auth.Token // TODO: properly initialize auth token + if err := insertHandler(&at, c); err != nil { writeErrorsTCP.Inc() logger.Errorf("error in TCP Graphite conn %q<->%q: %s", c.LocalAddr(), c.RemoteAddr(), err) } @@ -88,6 +90,7 @@ func serveUDP(ln net.PacketConn) { defer wg.Done() var bb bytesutil.ByteBuffer bb.B = bytesutil.Resize(bb.B, 64*1024) + var at auth.Token // TODO: properly initialize auth token for { bb.Reset() bb.B = bb.B[:cap(bb.B)] @@ -108,7 +111,7 @@ func serveUDP(ln net.PacketConn) { } bb.B = bb.B[:n] writeRequestsUDP.Inc() - if err := insertHandler(bb.NewReader()); err != nil { + if err := insertHandler(&at, bb.NewReader()); err != nil { writeErrorsUDP.Inc() logger.Errorf("error in UDP Graphite conn %q<->%q: %s", ln.LocalAddr(), addr, err) continue diff --git a/app/vminsert/influx/request_handler.go b/app/vminsert/influx/request_handler.go index 2c713f66c..773f2b94f 100644 --- a/app/vminsert/influx/request_handler.go +++ b/app/vminsert/influx/request_handler.go @@ -10,8 +10,9 @@ import ( "sync" "time" - "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/common" "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/concurrencylimiter" + "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/netstorage" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/auth" "github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/storage" 
"github.com/VictoriaMetrics/metrics" @@ -22,13 +23,13 @@ var rowsInserted = metrics.NewCounter(`vm_rows_inserted_total{type="influx"}`) // InsertHandler processes remote write for influx line protocol. // // See https://github.com/influxdata/influxdb/blob/4cbdc197b8117fee648d62e2e5be75c6575352f0/tsdb/README.md -func InsertHandler(req *http.Request) error { +func InsertHandler(at *auth.Token, req *http.Request) error { return concurrencylimiter.Do(func() error { - return insertHandlerInternal(req) + return insertHandlerInternal(at, req) }) } -func insertHandlerInternal(req *http.Request) error { +func insertHandlerInternal(at *auth.Token, req *http.Request) error { influxReadCalls.Inc() r := req.Body @@ -64,21 +65,17 @@ func insertHandlerInternal(req *http.Request) error { ctx := getPushCtx() defer putPushCtx(ctx) for ctx.Read(r, tsMultiplier) { - if err := ctx.InsertRows(db); err != nil { + if err := ctx.InsertRows(at, db); err != nil { return err } } return ctx.Error() } -func (ctx *pushCtx) InsertRows(db string) error { +func (ctx *pushCtx) InsertRows(at *auth.Token, db string) error { rows := ctx.Rows.Rows - rowsLen := 0 - for i := range rows { - rowsLen += len(rows[i].Tags) - } ic := &ctx.Common - ic.Reset(rowsLen) + ic.Reset() for i := range rows { r := &rows[i] ic.Labels = ic.Labels[:0] @@ -87,17 +84,25 @@ func (ctx *pushCtx) InsertRows(db string) error { tag := &r.Tags[j] ic.AddLabel(tag.Key, tag.Value) } - ctx.metricNameBuf = storage.MarshalMetricNameRaw(ctx.metricNameBuf[:0], ic.Labels) + ic.MetricNameBuf = storage.MarshalMetricNameRaw(ic.MetricNameBuf[:0], at.AccountID, at.ProjectID, ic.Labels) + metricNameBufLen := len(ic.MetricNameBuf) ctx.metricGroupBuf = append(ctx.metricGroupBuf[:0], r.Measurement...) ctx.metricGroupBuf = append(ctx.metricGroupBuf, '.') metricGroupPrefixLen := len(ctx.metricGroupBuf) + ic.Labels = ic.Labels[:0] + ic.AddLabel("", "placeholder") + placeholderLabel := &ic.Labels[len(ic.Labels)-1] for j := range r.Fields { f := &r.Fields[j] ctx.metricGroupBuf = append(ctx.metricGroupBuf[:metricGroupPrefixLen], f.Key...) 
metricGroup := bytesutil.ToUnsafeString(ctx.metricGroupBuf) - ic.Labels = ic.Labels[:0] + ic.Labels = ic.Labels[:len(ic.Labels)-1] ic.AddLabel("", metricGroup) - ic.WriteDataPoint(ctx.metricNameBuf, ic.Labels[:1], r.Timestamp, f.Value) + ic.MetricNameBuf = storage.MarshalMetricLabelRaw(ic.MetricNameBuf[:metricNameBufLen], placeholderLabel) + storageNodeIdx := ic.GetStorageNodeIdx(at, ic.Labels) + if err := ic.WriteDataPointExt(at, storageNodeIdx, ic.MetricNameBuf, r.Timestamp, f.Value); err != nil { + return err + } } rowsInserted.Add(len(r.Fields)) } @@ -189,12 +194,11 @@ var ( type pushCtx struct { Rows Rows - Common common.InsertCtx + Common netstorage.InsertCtx reqBuf bytesutil.ByteBuffer tailBuf []byte copyBuf [16 * 1024]byte - metricNameBuf []byte metricGroupBuf []byte err error @@ -209,11 +213,9 @@ func (ctx *pushCtx) Error() error { func (ctx *pushCtx) reset() { ctx.Rows.Reset() - ctx.Common.Reset(0) - + ctx.Common.Reset() ctx.reqBuf.Reset() ctx.tailBuf = ctx.tailBuf[:0] - ctx.metricNameBuf = ctx.metricNameBuf[:0] ctx.metricGroupBuf = ctx.metricGroupBuf[:0] ctx.err = nil diff --git a/app/vminsert/main.go b/app/vminsert/main.go index eff463b81..944b67f0e 100644 --- a/app/vminsert/main.go +++ b/app/vminsert/main.go @@ -1,68 +1,119 @@ -package vminsert +package main import ( "flag" "fmt" "net/http" - "strings" + "time" "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/graphite" "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/influx" + "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/netstorage" "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/opentsdb" "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/prometheus" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/auth" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil" "github.com/VictoriaMetrics/metrics" ) var ( graphiteListenAddr = flag.String("graphiteListenAddr", "", "TCP and UDP address to listen for Graphite plaintext data. Usually :2003 must be set. Doesn't work if empty") opentsdbListenAddr = flag.String("opentsdbListenAddr", "", "TCP and UDP address to listen for OpentTSDB put messages. Usually :4242 must be set. Doesn't work if empty") + httpListenAddr = flag.String("httpListenAddr", ":8480", "Address to listen for http connections") maxInsertRequestSize = flag.Int("maxInsertRequestSize", 32*1024*1024, "The maximum size of a single insert request in bytes") + storageNodes flagutil.Array ) -// Init initializes vminsert. -func Init() { +func main() { + flag.Var(&storageNodes, "storageNode", "Vmstorage address, usage -storageNode=vmstorage-host1:8400 -storageNode=vmstorage-host2:8400") + flag.Parse() + buildinfo.Init() + logger.Init() + + logger.Infof("initializing netstorage for storageNodes=%v...", storageNodes) + startTime := time.Now() + if len(storageNodes) == 0 { + logger.Fatalf("storageNodes cannot be empty") + } + netstorage.InitStorageNodes(storageNodes) + logger.Infof("successfully initialized netstorage in %s", time.Since(startTime)) + if len(*graphiteListenAddr) > 0 { go graphite.Serve(*graphiteListenAddr) } if len(*opentsdbListenAddr) > 0 { go opentsdb.Serve(*opentsdbListenAddr) } -} -// Stop stops vminsert. 
-func Stop() { + go func() { + httpserver.Serve(*httpListenAddr, requestHandler) + }() + + sig := procutil.WaitForSigterm() + logger.Infof("service received signal %s", sig) + + logger.Infof("gracefully shutting down the service at %q", *httpListenAddr) + startTime = time.Now() + if err := httpserver.Stop(*httpListenAddr); err != nil { + logger.Fatalf("cannot stop the service: %s", err) + } + logger.Infof("successfully shut down the service in %s", time.Since(startTime)) + if len(*graphiteListenAddr) > 0 { graphite.Stop() } if len(*opentsdbListenAddr) > 0 { opentsdb.Stop() } + + logger.Infof("shutting down neststorage...") + startTime = time.Now() + netstorage.Stop() + logger.Infof("successfully stopped netstorage in %s", time.Since(startTime)) + + logger.Infof("the vminsert has been stopped") } -// RequestHandler is a handler for Prometheus remote storage write API -func RequestHandler(w http.ResponseWriter, r *http.Request) bool { - path := strings.Replace(r.URL.Path, "//", "/", -1) - switch path { - case "/api/v1/write": +func requestHandler(w http.ResponseWriter, r *http.Request) bool { + p, err := httpserver.ParsePath(r.URL.Path) + if err != nil { + httpserver.Errorf(w, "cannot parse path %q: %s", r.URL.Path, err) + return true + } + if p.Prefix != "insert" { + // This is not our link. + return false + } + at, err := auth.NewToken(p.AuthToken) + if err != nil { + httpserver.Errorf(w, "auth error: %s", err) + return true + } + + switch p.Suffix { + case "prometheus/", "prometheus": prometheusWriteRequests.Inc() - if err := prometheus.InsertHandler(r, int64(*maxInsertRequestSize)); err != nil { + if err := prometheus.InsertHandler(at, r, int64(*maxInsertRequestSize)); err != nil { prometheusWriteErrors.Inc() httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err) return true } w.WriteHeader(http.StatusNoContent) return true - case "/write", "/api/v2/write": + case "influx/write", "influx/api/v2/write": influxWriteRequests.Inc() - if err := influx.InsertHandler(r); err != nil { + if err := influx.InsertHandler(at, r); err != nil { influxWriteErrors.Inc() httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err) return true } w.WriteHeader(http.StatusNoContent) return true - case "/query": + case "influx/query": // Emulate fake response for influx query influxQueryRequests.Inc() fmt.Fprintf(w, `{"results":[{"series":[{"values":[]}]}]}`) @@ -74,11 +125,11 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool { } var ( - prometheusWriteRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/write", protocol="prometheus"}`) - prometheusWriteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/write", protocol="prometheus"}`) + prometheusWriteRequests = metrics.NewCounter(`vm_http_requests_total{path="/insert/{}/prometheus/", protocol="prometheus"}`) + prometheusWriteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/insert/{}/prometheus/", protocol="prometheus"}`) - influxWriteRequests = metrics.NewCounter(`vm_http_requests_total{path="/write", protocol="influx"}`) - influxWriteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/write", protocol="influx"}`) + influxWriteRequests = metrics.NewCounter(`vm_http_requests_total{path="/insert/{}/influx/", protocol="influx"}`) + influxWriteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/insert/{}/influx/", protocol="influx"}`) - influxQueryRequests = metrics.NewCounter(`vm_http_requests_total{path="/query", protocol="influx"}`) + influxQueryRequests = 
metrics.NewCounter(`vm_http_requests_total{path="/insert/{}/influx/query", protocol="influx"}`) ) diff --git a/app/vminsert/netstorage/insert_ctx.go b/app/vminsert/netstorage/insert_ctx.go new file mode 100644 index 000000000..738e52f05 --- /dev/null +++ b/app/vminsert/netstorage/insert_ctx.go @@ -0,0 +1,134 @@ +package netstorage + +import ( + "fmt" + + "github.com/VictoriaMetrics/VictoriaMetrics/lib/auth" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/consts" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/storage" + xxhash "github.com/cespare/xxhash/v2" + jump "github.com/lithammer/go-jump-consistent-hash" +) + +// InsertCtx is a generic context for inserting data +type InsertCtx struct { + Labels []prompb.Label + MetricNameBuf []byte + + bufs [][]byte + labelsBuf []byte + sizeBuf [8]byte +} + +// Reset resets ctx. +func (ctx *InsertCtx) Reset() { + for _, label := range ctx.Labels { + label.Name = nil + label.Value = nil + } + ctx.Labels = ctx.Labels[:0] + ctx.MetricNameBuf = ctx.MetricNameBuf[:0] + + if ctx.bufs == nil { + ctx.bufs = make([][]byte, len(storageNodes)) + } + for i := range ctx.bufs { + ctx.bufs[i] = ctx.bufs[i][:0] + } + ctx.labelsBuf = ctx.labelsBuf[:0] +} + +// AddLabel adds (name, value) label to ctx.Labels. +// +// name and value must exist until ctx.Labels is used. +func (ctx *InsertCtx) AddLabel(name, value string) { + labels := ctx.Labels + if cap(labels) > len(labels) { + labels = labels[:len(labels)+1] + } else { + labels = append(labels, prompb.Label{}) + } + label := &labels[len(labels)-1] + + // Do not copy name and value contents for performance reasons. + // This reduces GC overhead on the number of objects and allocations. + label.Name = bytesutil.ToUnsafeBytes(name) + label.Value = bytesutil.ToUnsafeBytes(value) + + ctx.Labels = labels +} + +// WriteDataPoint writes (timestamp, value) data point with the given at and labels to ctx buffer. +func (ctx *InsertCtx) WriteDataPoint(at *auth.Token, labels []prompb.Label, timestamp int64, value float64) error { + ctx.MetricNameBuf = storage.MarshalMetricNameRaw(ctx.MetricNameBuf[:0], at.AccountID, at.ProjectID, labels) + storageNodeIdx := ctx.GetStorageNodeIdx(at, labels) + return ctx.WriteDataPointExt(at, storageNodeIdx, ctx.MetricNameBuf, timestamp, value) +} + +// WriteDataPointExt writes the given metricNameRaw with (timestmap, value) to ctx buffer with the given storageNodeIdx. +func (ctx *InsertCtx) WriteDataPointExt(at *auth.Token, storageNodeIdx int, metricNameRaw []byte, timestamp int64, value float64) error { + buf := ctx.bufs[storageNodeIdx] + sn := storageNodes[storageNodeIdx] + bufNew := storage.MarshalMetricRow(buf, metricNameRaw, timestamp, value) + if len(bufNew) >= consts.MaxInsertPacketSize { + // Send buf to storageNode, since it is too big. + if err := sn.sendWithFallback(buf, ctx.sizeBuf[:]); err != nil { + return fmt.Errorf("cannot send %d bytes to storageNodes: %s", len(buf), err) + } + buf = storage.MarshalMetricRow(bufNew[:0], metricNameRaw, timestamp, value) + } else { + buf = bufNew + } + ctx.bufs[storageNodeIdx] = buf + sn.RowsPushed.Inc() + return nil +} + +// FlushBufs flushes ctx bufs to remote storage nodes. +func (ctx *InsertCtx) FlushBufs() error { + // Send per-storageNode bufs. 
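+	// Rows were routed to per-node bufs by GetStorageNodeIdx (jump consistent
+	// hashing over the tenant and labels), so each pending batch is sent to the
+	// vmstorage node that owns it; sendWithFallback re-routes the batch to
+	// another node if the target is unavailable.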
+ sizeBuf := ctx.sizeBuf[:] + for i, buf := range ctx.bufs { + if len(buf) == 0 { + continue + } + sn := storageNodes[i] + if err := sn.sendWithFallback(buf, sizeBuf); err != nil { + return fmt.Errorf("cannot send data to storageNodes: %s", err) + } + } + return nil +} + +// GetStorageNodeIdx returns storage node index for the given at and labels. +// +// The returned index must be passed to WriteDataPoint. +func (ctx *InsertCtx) GetStorageNodeIdx(at *auth.Token, labels []prompb.Label) int { + if len(storageNodes) == 1 { + // Fast path - only a single storage node. + return 0 + } + + buf := ctx.labelsBuf[:0] + buf = encoding.MarshalUint32(buf, at.AccountID) + buf = encoding.MarshalUint32(buf, at.ProjectID) + for i := range labels { + label := &labels[i] + buf = marshalBytesFast(buf, label.Name) + buf = marshalBytesFast(buf, label.Value) + } + h := xxhash.Sum64(buf) + ctx.labelsBuf = buf + + idx := int(jump.Hash(h, int32(len(storageNodes)))) + return idx +} + +func marshalBytesFast(dst []byte, s []byte) []byte { + dst = encoding.MarshalUint16(dst, uint16(len(s))) + dst = append(dst, s...) + return dst +} diff --git a/app/vminsert/netstorage/netstorage.go b/app/vminsert/netstorage/netstorage.go new file mode 100644 index 000000000..1ef785172 --- /dev/null +++ b/app/vminsert/netstorage/netstorage.go @@ -0,0 +1,215 @@ +package netstorage + +import ( + "fmt" + "sync" + "time" + + "github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/handshake" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil" + "github.com/VictoriaMetrics/metrics" +) + +// sendWithFallback sends buf to storage node sn. +// +// It falls back to sending data to another storage node if sn is currently +// unavailable. +func (sn *storageNode) sendWithFallback(buf []byte, sizeBuf []byte) error { + deadline := time.Now().Add(30 * time.Second) + err := sn.sendBuf(buf, deadline, sizeBuf) + if err == nil { + return nil + } + + // Failed to send the data to sn. Try sending it to another storageNodes. + if time.Until(deadline) <= 0 { + sn.timeouts.Inc() + return err + } + if len(storageNodes) == 1 { + return err + } + idx := func() int { + for i, snOther := range storageNodes { + if sn == snOther { + return i + } + } + logger.Panicf("BUG: cannot find storageNode %p in storageNodes %p", sn, storageNodes) + return -1 + }() + for i := 0; i < len(storageNodes); i++ { + idx++ + if idx >= len(storageNodes) { + idx = 0 + } + err = storageNodes[idx].sendBuf(buf, deadline, sizeBuf) + if err == nil { + storageNodes[idx].fallbacks.Inc() + return nil + } + if time.Until(deadline) <= 0 { + sn.timeouts.Inc() + return err + } + } + return err +} + +func (sn *storageNode) sendBuf(buf []byte, deadline time.Time, sizeBuf []byte) error { + // sizeBuf guarantees that the rows batch will be either fully + // read or fully discarded on the vmstorage. + // sizeBuf is used for read optimization in vmstorage. 
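+	// Batches are kept roughly within consts.MaxInsertPacketSize by
+	// WriteDataPointExt, which flushes early when a buffer grows too large,
+	// so the 8-byte size prefix marshalled below always describes a bounded payload.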
+ encoding.MarshalUint64(sizeBuf[:0], uint64(len(buf))) + + sn.bcLock.Lock() + defer sn.bcLock.Unlock() + + if sn.bc == nil { + if err := sn.dial(); err != nil { + return fmt.Errorf("cannot dial %q: %s", sn.dialer.Addr(), err) + } + } + + if err := sn.sendBufNolock(buf, deadline, sizeBuf); err != nil { + sn.closeConn() + return err + } + return nil +} + +func (sn *storageNode) sendBufNolock(buf []byte, deadline time.Time, sizeBuf []byte) error { + if err := sn.bc.SetWriteDeadline(deadline); err != nil { + return fmt.Errorf("cannot set write deadline to %s: %s", deadline, err) + } + if _, err := sn.bc.Write(sizeBuf); err != nil { + return fmt.Errorf("cannot write data size %d: %s", len(buf), err) + } + if _, err := sn.bc.Write(buf); err != nil { + return fmt.Errorf("cannot write data: %s", err) + } + return nil +} + +func (sn *storageNode) dial() error { + c, err := sn.dialer.Dial() + if err != nil { + sn.dialErrors.Inc() + return err + } + + compressionLevel := 1 + bc, err := handshake.VMInsertClient(c, compressionLevel) + if err != nil { + _ = c.Close() + sn.handshakeErrors.Inc() + return fmt.Errorf("handshake error: %s", err) + } + + sn.bc = bc + return nil +} + +func (sn *storageNode) closeConn() { + _ = sn.bc.Close() + sn.bc = nil + sn.connectionErrors.Inc() +} + +func (sn *storageNode) run() { + mustStop := false + for !mustStop { + select { + case <-stopCh: + mustStop = true + case <-time.After(time.Second): + } + + sn.bcLock.Lock() + if err := sn.flushNolock(); err != nil { + sn.closeConn() + logger.Errorf("cannot flush data to storageNode %q: %s", sn.dialer.Addr(), err) + } + sn.bcLock.Unlock() + } +} + +func (sn *storageNode) flushNolock() error { + if sn.bc == nil { + return nil + } + if err := sn.bc.SetWriteDeadline(time.Now().Add(30 * time.Second)); err != nil { + return fmt.Errorf("cannot set write deadline: %s", err) + } + return sn.bc.Flush() +} + +// storageNode is a client sending data to storage node. +type storageNode struct { + dialer *netutil.TCPDialer + + bc *handshake.BufferedConn + bcLock sync.Mutex + + // The number of times the storage node was timed out (overflown). + timeouts *metrics.Counter + + // The number of dial errors to storage node. + dialErrors *metrics.Counter + + // The number of handshake errors to storage node. + handshakeErrors *metrics.Counter + + // The number of connection errors to storage node. + connectionErrors *metrics.Counter + + // The number of fallbacks to this node. + fallbacks *metrics.Counter + + // The number of rows pushed to storage node. + RowsPushed *metrics.Counter +} + +// storageNodes contains a list of storage node clients. +var storageNodes []*storageNode + +var storageNodesWG sync.WaitGroup + +var stopCh = make(chan struct{}) + +// InitStorageNodes initializes storage nodes' connections to the given addrs. 
+func InitStorageNodes(addrs []string) { + if len(addrs) == 0 { + logger.Panicf("BUG: addrs must be non-empty") + } + if len(addrs) > 255 { + logger.Panicf("BUG: too many addresses: %d; max supported %d addresses", len(addrs), 255) + } + + for _, addr := range addrs { + sn := &storageNode{ + dialer: netutil.NewTCPDialer("vminsert", addr), + + timeouts: metrics.NewCounter(fmt.Sprintf(`vm_rpc_timeouts_total{name="vminsert", addr=%q}`, addr)), + dialErrors: metrics.NewCounter(fmt.Sprintf(`vm_rpc_dial_errors_total{name="vminsert", addr=%q}`, addr)), + handshakeErrors: metrics.NewCounter(fmt.Sprintf(`vm_rpc_handshake_errors_total{name="vminsert", addr=%q}`, addr)), + connectionErrors: metrics.NewCounter(fmt.Sprintf(`vm_rpc_connection_errors_total{name="vminsert", addr=%q}`, addr)), + fallbacks: metrics.NewCounter(fmt.Sprintf(`vm_rpc_fallbacks_total{name="vminsert", addr=%q}`, addr)), + RowsPushed: metrics.NewCounter(fmt.Sprintf(`vm_rpc_rows_pushed_total{name="vminsert", addr=%q}`, addr)), + } + storageNodes = append(storageNodes, sn) + storageNodesWG.Add(1) + go func(addr string) { + sn.run() + storageNodesWG.Done() + }(addr) + } +} + +// Stop gracefully stops netstorage. +func Stop() { + close(stopCh) + storageNodesWG.Wait() +} diff --git a/app/vminsert/opentsdb/request_handler.go b/app/vminsert/opentsdb/request_handler.go index eef981a5f..8eff6a95e 100644 --- a/app/vminsert/opentsdb/request_handler.go +++ b/app/vminsert/opentsdb/request_handler.go @@ -9,8 +9,9 @@ import ( "sync" "time" - "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/common" "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/concurrencylimiter" + "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/netstorage" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/auth" "github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil" "github.com/VictoriaMetrics/metrics" ) @@ -20,27 +21,27 @@ var rowsInserted = metrics.NewCounter(`vm_rows_inserted_total{type="opentsdb"}`) // insertHandler processes remote write for OpenTSDB put protocol.
// // See http://opentsdb.net/docs/build/html/api_telnet/put.html -func insertHandler(r io.Reader) error { +func insertHandler(at *auth.Token, r io.Reader) error { return concurrencylimiter.Do(func() error { - return insertHandlerInternal(r) + return insertHandlerInternal(at, r) }) } -func insertHandlerInternal(r io.Reader) error { +func insertHandlerInternal(at *auth.Token, r io.Reader) error { ctx := getPushCtx() defer putPushCtx(ctx) for ctx.Read(r) { - if err := ctx.InsertRows(); err != nil { + if err := ctx.InsertRows(at); err != nil { return err } } return ctx.Error() } -func (ctx *pushCtx) InsertRows() error { +func (ctx *pushCtx) InsertRows(at *auth.Token) error { rows := ctx.Rows.Rows ic := &ctx.Common - ic.Reset(len(rows)) + ic.Reset() for i := range rows { r := &rows[i] ic.Labels = ic.Labels[:0] @@ -49,7 +50,9 @@ func (ctx *pushCtx) InsertRows() error { tag := &r.Tags[j] ic.AddLabel(tag.Key, tag.Value) } - ic.WriteDataPoint(nil, ic.Labels, r.Timestamp, r.Value) + if err := ic.WriteDataPoint(at, ic.Labels, r.Timestamp, r.Value); err != nil { + return err + } } rowsInserted.Add(len(rows)) return ic.FlushBufs() @@ -110,7 +113,7 @@ func (ctx *pushCtx) Read(r io.Reader) bool { type pushCtx struct { Rows Rows - Common common.InsertCtx + Common netstorage.InsertCtx reqBuf bytesutil.ByteBuffer tailBuf []byte @@ -128,7 +131,7 @@ func (ctx *pushCtx) Error() error { func (ctx *pushCtx) reset() { ctx.Rows.Reset() - ctx.Common.Reset(0) + ctx.Common.Reset() ctx.reqBuf.Reset() ctx.tailBuf = ctx.tailBuf[:0] diff --git a/app/vminsert/opentsdb/server.go b/app/vminsert/opentsdb/server.go index b7f37a0ce..d39f524b3 100644 --- a/app/vminsert/opentsdb/server.go +++ b/app/vminsert/opentsdb/server.go @@ -7,6 +7,7 @@ import ( "sync" "time" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/auth" "github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/metrics" @@ -70,7 +71,8 @@ func serveTCP(ln net.Listener) { } go func() { writeRequestsTCP.Inc() - if err := insertHandler(c); err != nil { + var at auth.Token // TODO: properly initialize the auth token + if err := insertHandler(&at, c); err != nil { writeErrorsTCP.Inc() logger.Errorf("error in TCP OpenTSDB conn %q<->%q: %s", c.LocalAddr(), c.RemoteAddr(), err) } @@ -88,6 +90,7 @@ func serveUDP(ln net.PacketConn) { defer wg.Done() var bb bytesutil.ByteBuffer bb.B = bytesutil.Resize(bb.B, 64*1024) + var at auth.Token // TODO: properly initialize the auth token for { bb.Reset() bb.B = bb.B[:cap(bb.B)] @@ -108,7 +111,7 @@ func serveUDP(ln net.PacketConn) { } bb.B = bb.B[:n] writeRequestsUDP.Inc() - if err := insertHandler(bb.NewReader()); err != nil { + if err := insertHandler(&at, bb.NewReader()); err != nil { writeErrorsUDP.Inc() logger.Errorf("error in UDP OpenTSDB conn %q<->%q: %s", ln.LocalAddr(), addr, err) continue diff --git a/app/vminsert/prometheus/request_handler.go b/app/vminsert/prometheus/request_handler.go index ab544afac..e34a23bb5 100644 --- a/app/vminsert/prometheus/request_handler.go +++ b/app/vminsert/prometheus/request_handler.go @@ -6,40 +6,45 @@ import ( "runtime" "sync" - "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/common" "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/concurrencylimiter" + "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/netstorage" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/auth" "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/storage" 
"github.com/VictoriaMetrics/metrics" ) var rowsInserted = metrics.NewCounter(`vm_rows_inserted_total{type="prometheus"}`) // InsertHandler processes remote write for prometheus. -func InsertHandler(r *http.Request, maxSize int64) error { +func InsertHandler(at *auth.Token, r *http.Request, maxSize int64) error { return concurrencylimiter.Do(func() error { - return insertHandlerInternal(r, maxSize) + return insertHandlerInternal(at, r, maxSize) }) } -func insertHandlerInternal(r *http.Request, maxSize int64) error { +func insertHandlerInternal(at *auth.Token, r *http.Request, maxSize int64) error { ctx := getPushCtx() defer putPushCtx(ctx) if err := ctx.Read(r, maxSize); err != nil { return err } - timeseries := ctx.req.Timeseries - rowsLen := 0 - for i := range timeseries { - rowsLen += len(timeseries[i].Samples) - } + ic := &ctx.Common - ic.Reset(rowsLen) + ic.Reset() + timeseries := ctx.req.Timeseries for i := range timeseries { ts := ×eries[i] - var metricNameRaw []byte + storageNodeIdx := ic.GetStorageNodeIdx(at, ts.Labels) + ic.MetricNameBuf = ic.MetricNameBuf[:0] for i := range ts.Samples { r := &ts.Samples[i] - metricNameRaw = ic.WriteDataPointExt(metricNameRaw, ts.Labels, r.Timestamp, r.Value) + if len(ic.MetricNameBuf) == 0 { + ic.MetricNameBuf = storage.MarshalMetricNameRaw(ic.MetricNameBuf[:0], at.AccountID, at.ProjectID, ts.Labels) + } + if err := ic.WriteDataPointExt(at, storageNodeIdx, ic.MetricNameBuf, r.Timestamp, r.Value); err != nil { + return err + } } rowsInserted.Add(len(ts.Samples)) } @@ -47,14 +52,14 @@ func insertHandlerInternal(r *http.Request, maxSize int64) error { } type pushCtx struct { - Common common.InsertCtx + Common netstorage.InsertCtx req prompb.WriteRequest reqBuf []byte } func (ctx *pushCtx) reset() { - ctx.Common.Reset(0) + ctx.Common.Reset() ctx.req.Reset() ctx.reqBuf = ctx.reqBuf[:0] } diff --git a/app/vmselect/Makefile b/app/vmselect/Makefile new file mode 100644 index 000000000..c175ba74c --- /dev/null +++ b/app/vmselect/Makefile @@ -0,0 +1,32 @@ +# All these commands must run from repository root. + +run-vmselect: + mkdir -p vmselect-cache + DOCKER_OPTS='-v $(shell pwd)/vmselect-cache:/cache -p 8481:8481' \ + APP_NAME=vmselect \ + ARGS='-storageNode=localhost:8401 -selectNode=localhost:8481 -cacheDataPath=/cache' \ + $(MAKE) run-via-docker + +vmselect: + APP_NAME=vmselect $(MAKE) app-local + +vmselect-race: + APP_NAME=vmselect RACE=-race $(MAKE) app-local + +vmselect-prod: + APP_NAME=vmselect $(MAKE) app-via-docker + +vmselect-prod-race: + APP_NAME=vmselect RACE=-race $(MAKE) app-via-docker + +package-vmselect: + APP_NAME=vmselect $(MAKE) package-via-docker + +package-vmselect-race: + APP_NAME=vmselect RACE=-race $(MAKE) package-via-docker + +publish-vmselect: + APP_NAME=vmselect $(MAKE) publish-via-docker + +publish-vmselect-race: + APP_NAME=vmselect RACE=-race $(MAKE) publish-via-docker diff --git a/app/vmselect/README.md b/app/vmselect/README.md index 1335e9407..1d1ed69cf 100644 --- a/app/vmselect/README.md +++ b/app/vmselect/README.md @@ -1,2 +1,6 @@ -`vmselect` performs the incoming queries and fetches the required data -from `vmstorage`. +`vmselect` performs the following tasks: + +- Splits incoming selects to tasks for `vmstorage` nodes and issues these tasks + to all the `vmstorage` nodes in the cluster. + +- Merges responses from all the `vmstorage` nodes and returns a single response. 
diff --git a/app/vmselect/deployment/Dockerfile b/app/vmselect/deployment/Dockerfile new file mode 100644 index 000000000..bbaef9028 --- /dev/null +++ b/app/vmselect/deployment/Dockerfile @@ -0,0 +1,5 @@ +FROM scratch +COPY --from=local/certs:1.0.2 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt +COPY bin/vmselect-prod . +EXPOSE 8481 +ENTRYPOINT ["/vmselect-prod"] diff --git a/app/vmselect/main.go b/app/vmselect/main.go index 4561abec4..9f344f05a 100644 --- a/app/vmselect/main.go +++ b/app/vmselect/main.go @@ -1,4 +1,4 @@ -package vmselect +package main import ( "flag" @@ -10,37 +10,78 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/netstorage" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/prometheus" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql" - "github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/auth" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/fs" "github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil" "github.com/VictoriaMetrics/metrics" ) var ( - deleteAuthKey = flag.String("deleteAuthKey", "", "authKey for metrics' deletion via /api/v1/admin/tsdb/delete_series") + httpListenAddr = flag.String("httpListenAddr", ":8481", "Address to listen for http connections") + cacheDataPath = flag.String("cacheDataPath", "", "Path to directory for cache files. Cache isn't saved if empty") maxConcurrentRequests = flag.Int("search.maxConcurrentRequests", runtime.GOMAXPROCS(-1)*2, "The maximum number of concurrent search requests. It shouldn't exceed 2*vCPUs for better performance. 
See also -search.maxQueueDuration") maxQueueDuration = flag.Duration("search.maxQueueDuration", 10*time.Second, "The maximum time the request waits for execution when -search.maxConcurrentRequests limit is reached") + + storageNodes flagutil.Array ) -// Init initializes vmselect -func Init() { - tmpDirPath := *vmstorage.DataPath + "/tmp" - fs.RemoveDirContents(tmpDirPath) - netstorage.InitTmpBlocksDir(tmpDirPath) - promql.InitRollupResultCache(*vmstorage.DataPath + "/cache/rollupResult") +func main() { + flag.Var(&storageNodes, "storageNode", "Vmstorage address, usage -storageNode=vmstorage-host1:8401 -storageNode=vmstorage-host2:8401") + flag.Parse() + buildinfo.Init() + logger.Init() + + logger.Infof("starting netstorage at storageNodes=%v", storageNodes) + startTime := time.Now() + if len(storageNodes) == 0 { + logger.Fatalf("storageNodes cannot be empty") + } + netstorage.InitStorageNodes(storageNodes) + logger.Infof("started netstorage in %s", time.Since(startTime)) + + if len(*cacheDataPath) > 0 { + tmpDataPath := *cacheDataPath + "/tmp" + fs.RemoveDirContents(tmpDataPath) + netstorage.InitTmpBlocksDir(tmpDataPath) + promql.InitRollupResultCache(*cacheDataPath + "/rollupResult") + } else { + netstorage.InitTmpBlocksDir("") + promql.InitRollupResultCache("") + } concurrencyCh = make(chan struct{}, *maxConcurrentRequests) + + go func() { + httpserver.Serve(*httpListenAddr, requestHandler) + }() + + sig := procutil.WaitForSigterm() + logger.Infof("service received signal %s", sig) + + logger.Infof("gracefully shutting down the service at %q", *httpListenAddr) + startTime = time.Now() + if err := httpserver.Stop(*httpListenAddr); err != nil { + logger.Fatalf("cannot stop the service: %s", err) + } + logger.Infof("successfully shut down the service in %s", time.Since(startTime)) + + logger.Infof("shutting down neststorage...") + startTime = time.Now() + netstorage.Stop() + if len(*cacheDataPath) > 0 { + promql.StopRollupResultCache() + } + logger.Infof("successfully stopped netstorage in %s", time.Since(startTime)) + + logger.Infof("the vmselect has been stopped") } var concurrencyCh chan struct{} -// Stop stops vmselect -func Stop() { - promql.StopRollupResultCache() -} - -// RequestHandler handles remote read API requests for Prometheus -func RequestHandler(w http.ResponseWriter, r *http.Request) bool { +func requestHandler(w http.ResponseWriter, r *http.Request) bool { // Limit the number of concurrent queries. // Sleep for a second until giving up. This should resolve short bursts in requests. 
t := time.NewTimer(*maxQueueDuration) @@ -53,14 +94,41 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool { return true } - path := strings.Replace(r.URL.Path, "//", "/", -1) - if strings.HasPrefix(path, "/api/v1/label/") { - s := r.URL.Path[len("/api/v1/label/"):] + path := r.URL.Path + if path == "/internal/resetRollupResultCache" { + promql.ResetRollupResultCache() + return true + } + + p, err := httpserver.ParsePath(path) + if err != nil { + httpserver.Errorf(w, "cannot parse path %q: %s", path, err) + return true + } + at, err := auth.NewToken(p.AuthToken) + if err != nil { + httpserver.Errorf(w, "auth error: %s", err) + return true + } + switch p.Prefix { + case "select": + return selectHandler(w, r, p, at) + case "delete": + return deleteHandler(w, r, p, at) + default: + // This is not our link + return false + } +} + +func selectHandler(w http.ResponseWriter, r *http.Request, p *httpserver.Path, at *auth.Token) bool { + if strings.HasPrefix(p.Suffix, "prometheus/api/v1/label/") { + s := p.Suffix[len("prometheus/api/v1/label/"):] if strings.HasSuffix(s, "/values") { labelValuesRequests.Inc() labelName := s[:len(s)-len("/values")] httpserver.EnableCORS(w, r) - if err := prometheus.LabelValuesHandler(labelName, w, r); err != nil { + if err := prometheus.LabelValuesHandler(at, labelName, w, r); err != nil { labelValuesErrors.Inc() sendPrometheusError(w, r, err) return true @@ -69,76 +137,78 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool { } } - switch path { - case "/api/v1/query": + switch p.Suffix { + case "prometheus/api/v1/query": queryRequests.Inc() httpserver.EnableCORS(w, r) - if err := prometheus.QueryHandler(w, r); err != nil { + if err := prometheus.QueryHandler(at, w, r); err != nil { queryErrors.Inc() sendPrometheusError(w, r, err) return true } return true - case "/api/v1/query_range": + case "prometheus/api/v1/query_range": queryRangeRequests.Inc() httpserver.EnableCORS(w, r) - if err := prometheus.QueryRangeHandler(w, r); err != nil { + if err := prometheus.QueryRangeHandler(at, w, r); err != nil { queryRangeErrors.Inc() sendPrometheusError(w, r, err) return true } return true - case "/api/v1/series": + case "prometheus/api/v1/series": seriesRequests.Inc() httpserver.EnableCORS(w, r) - if err := prometheus.SeriesHandler(w, r); err != nil { + if err := prometheus.SeriesHandler(at, w, r); err != nil { seriesErrors.Inc() sendPrometheusError(w, r, err) return true } return true - case "/api/v1/series/count": + case "prometheus/api/v1/series/count": seriesCountRequests.Inc() httpserver.EnableCORS(w, r) - if err := prometheus.SeriesCountHandler(w, r); err != nil { + if err := prometheus.SeriesCountHandler(at, w, r); err != nil { seriesCountErrors.Inc() sendPrometheusError(w, r, err) return true } return true - case "/api/v1/labels": + case "prometheus/api/v1/labels": labelsRequests.Inc() httpserver.EnableCORS(w, r) - if err := prometheus.LabelsHandler(w, r); err != nil { + if err := prometheus.LabelsHandler(at, w, r); err != nil { labelsErrors.Inc() sendPrometheusError(w, r, err) return true } return true - case "/api/v1/export": + case "prometheus/api/v1/export": exportRequests.Inc() - if err := prometheus.ExportHandler(w, r); err != nil { + if err := prometheus.ExportHandler(at, w, r); err != nil { exportErrors.Inc() httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err) return true } return true - case "/federate": + case "prometheus/federate": federateRequests.Inc() - if err := prometheus.FederateHandler(w, r); err != nil { + if err 
:= prometheus.FederateHandler(at, w, r); err != nil { federateErrors.Inc() - httpserver.Errorf(w, "error int %q: %s", r.URL.Path, err) + httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err) return true } return true - case "/api/v1/admin/tsdb/delete_series": + default: + return false + } +} + +func deleteHandler(w http.ResponseWriter, r *http.Request, p *httpserver.Path, at *auth.Token) bool { + switch p.Suffix { + case "prometheus/api/v1/admin/tsdb/delete_series": deleteRequests.Inc() - authKey := r.FormValue("authKey") - if authKey != *deleteAuthKey { - httpserver.Errorf(w, "invalid authKey %q. It must match the value from -deleteAuthKey command line flag", authKey) - return true - } - if err := prometheus.DeleteHandler(r); err != nil { + if err := prometheus.DeleteHandler(at, r); err != nil { deleteErrors.Inc() httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err) return true @@ -160,30 +230,30 @@ func sendPrometheusError(w http.ResponseWriter, r *http.Request, err error) { } var ( - labelValuesRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/label/{}/values"}`) - labelValuesErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/label/{}/values"}`) + labelValuesRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/api/v1/label/{}/values"}`) + labelValuesErrors = metrics.NewCounter(`vm_http_request_errors_total{path="select/{}/prometheus/api/v1/label/{}/values"}`) - queryRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/query"}`) - queryErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/query"}`) + queryRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/api/v1/query"}`) + queryErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/select/{}/prometheus/api/v1/query"}`) - queryRangeRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/query_range"}`) - queryRangeErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/query_range"}`) + queryRangeRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/prometheus/api/v1/query_range"}`) + queryRangeErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/select/{}/prometheus/api/v1/query_range"}`) - seriesRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/series"}`) - seriesErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/series"}`) + seriesRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/api/v1/series"}`) + seriesErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/select/{}/prometheus/api/v1/series"}`) - seriesCountRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/series/count"}`) - seriesCountErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/series/count"}`) + seriesCountRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/api/v1/series/count"}`) + seriesCountErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/select/{}/prometheus/api/v1/series/count"}`) - labelsRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/labels"}`) - labelsErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/labels"}`) + labelsRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/api/v1/labels"}`) + labelsErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/select/{}/prometheus/api/v1/labels"}`) - deleteRequests = 
metrics.NewCounter(`vm_http_requests_total{path="/api/v1/admin/tsdb/delete_series"}`) - deleteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/admin/tsdb/delete_series"}`) + deleteRequests = metrics.NewCounter(`vm_http_requests_total{path="/delete/{}/prometheus/api/v1/admin/tsdb/delete_series"}`) + deleteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/delete/{}/prometheus/api/v1/admin/tsdb/delete_series"}`) - exportRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/export"}`) - exportErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/export"}`) + exportRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/api/v1/export"}`) + exportErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/select/{}/prometheus/api/v1/export"}`) - federateRequests = metrics.NewCounter(`vm_http_requests_total{path="/federate"}`) - federateErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/federate"}`) + federateRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/federate"}`) + federateErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/select/{}/prometheus/federate"}`) ) diff --git a/app/vmselect/netstorage/netstorage.go b/app/vmselect/netstorage/netstorage.go index f0a3cf728..022421251 100644 --- a/app/vmselect/netstorage/netstorage.go +++ b/app/vmselect/netstorage/netstorage.go @@ -2,28 +2,24 @@ package netstorage import ( "container/heap" - "flag" "fmt" + "io" "runtime" "sort" "sync" - "sync/atomic" "time" - "github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/auth" "github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/handshake" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/storage" "github.com/VictoriaMetrics/metrics" ) -var ( - maxTagKeysPerSearch = flag.Int("search.maxTagKeys", 10e3, "The maximum number of tag keys returned per search") - maxTagValuesPerSearch = flag.Int("search.maxTagValues", 10e3, "The maximum number of tag values returned per search") - maxMetricsPerSearch = flag.Int("search.maxUniqueTimeseries", 100e3, "The maximum number of unique time series each search can scan") -) - // Result is a single timeseries result. // // ProcessSearchQuery returns Result slice. @@ -49,6 +45,7 @@ func (r *Result) reset() { // Results holds results returned from ProcessSearchQuery. type Results struct { + at *auth.Token tr storage.TimeRange deadline Deadline @@ -102,7 +99,7 @@ func (rss *Results) RunParallel(f func(rs *Result)) error { err = fmt.Errorf("timeout exceeded during query execution: %s", rss.deadline.Timeout) break } - if err = pts.Unpack(rss.tbf, rs, rss.tr, maxWorkersCount); err != nil { + if err = pts.Unpack(rss.tbf, rs, rss.tr, rss.at, maxWorkersCount); err != nil { break } if len(rs.Timestamps) == 0 { @@ -148,7 +145,7 @@ type packedTimeseries struct { } // Unpack unpacks pts to dst. 
-func (pts *packedTimeseries) Unpack(tbf *tmpBlocksFile, dst *Result, tr storage.TimeRange, maxWorkersCount int) error { +func (pts *packedTimeseries) Unpack(tbf *tmpBlocksFile, dst *Result, tr storage.TimeRange, at *auth.Token, maxWorkersCount int) error { dst.reset() if err := dst.MetricName.Unmarshal(bytesutil.ToUnsafeBytes(pts.metricName)); err != nil { @@ -175,7 +172,7 @@ func (pts *packedTimeseries) Unpack(tbf *tmpBlocksFile, dst *Result, tr storage. var err error for addr := range workCh { sb := getSortBlock() - if err = sb.unpackFrom(tbf, addr, tr); err != nil { + if err = sb.unpackFrom(tbf, addr, tr, at); err != nil { break } @@ -294,7 +291,7 @@ func (sb *sortBlock) reset() { sb.NextIdx = 0 } -func (sb *sortBlock) unpackFrom(tbf *tmpBlocksFile, addr tmpBlockAddr, tr storage.TimeRange) error { +func (sb *sortBlock) unpackFrom(tbf *tmpBlocksFile, addr tmpBlockAddr, tr storage.TimeRange, at *auth.Token) error { tbf.MustReadBlockAt(&sb.b, addr) if err := sb.b.UnmarshalData(); err != nil { return fmt.Errorf("cannot unmarshal block: %s", err) @@ -352,21 +349,104 @@ func (sbh *sortBlocksHeap) Pop() interface{} { return v } -// DeleteSeries deletes time series matching the given tagFilterss. -func DeleteSeries(sq *storage.SearchQuery) (int, error) { - tfss, err := setupTfss(sq.TagFilterss) - if err != nil { - return 0, err +// DeleteSeries deletes time series matching the given sq. +func DeleteSeries(at *auth.Token, sq *storage.SearchQuery, deadline Deadline) (int, error) { + requestData := sq.Marshal(nil) + + // Send the query to all the storage nodes in parallel. + type nodeResult struct { + deletedCount int + err error } - return vmstorage.DeleteMetrics(tfss) + resultsCh := make(chan nodeResult, len(storageNodes)) + for _, sn := range storageNodes { + go func(sn *storageNode) { + sn.deleteSeriesRequests.Inc() + deletedCount, err := sn.deleteMetrics(requestData, deadline) + if err != nil { + sn.deleteSeriesRequestErrors.Inc() + } + resultsCh <- nodeResult{ + deletedCount: deletedCount, + err: err, + } + }(sn) + } + + // Collect results + deletedTotal := 0 + var errors []error + for i := 0; i < len(storageNodes); i++ { + // There is no need in timer here, since all the goroutines executing + // sn.deleteMetrics must be finished until the deadline. + nr := <-resultsCh + if nr.err != nil { + errors = append(errors, nr.err) + continue + } + deletedTotal += nr.deletedCount + } + if len(errors) > 0 { + // Return only the first error, since it has no sense in returning all errors. + return deletedTotal, fmt.Errorf("error occured during deleting time series: %s", errors[0]) + } + return deletedTotal, nil } // GetLabels returns labels until the given deadline. -func GetLabels(deadline Deadline) ([]string, error) { - labels, err := vmstorage.SearchTagKeys(*maxTagKeysPerSearch) - if err != nil { - return nil, fmt.Errorf("error during labels search: %s", err) +func GetLabels(at *auth.Token, deadline Deadline) ([]string, bool, error) { + // Send the query to all the storage nodes in parallel. 
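+	// Results from the individual vmstorage nodes are merged and deduplicated below;
+	// if only some nodes respond, the merged result is returned as partial.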
+ type nodeResult struct { + labels []string + err error } + resultsCh := make(chan nodeResult, len(storageNodes)) + for _, sn := range storageNodes { + go func(sn *storageNode) { + sn.labelsRequests.Inc() + labels, err := sn.getLabels(at.AccountID, at.ProjectID, deadline) + if err != nil { + sn.labelsRequestErrors.Inc() + err = fmt.Errorf("cannot get labels from vmstorage %s: %s", sn.connPool.Addr(), err) + } + resultsCh <- nodeResult{ + labels: labels, + err: err, + } + }(sn) + } + + // Collect results + var labels []string + var errors []error + for i := 0; i < len(storageNodes); i++ { + // There is no need in timer here, since all the goroutines executing + // sn.getLabels must be finished until the deadline. + nr := <-resultsCh + if nr.err != nil { + errors = append(errors, nr.err) + continue + } + labels = append(labels, nr.labels...) + } + isPartialResult := false + if len(errors) > 0 { + if len(labels) == 0 { + // Return only the first error, since it has no sense in returning all errors. + return nil, true, fmt.Errorf("error occured during fetching labels: %s", errors[0]) + } + + // Just log errors and return partial results. + // This allows gracefully degrade vmselect in the case + // if certain storageNodes are temporarily unavailable. + partialLabelsResults.Inc() + // Log only the first error, since it has no sense in returning all errors. + logger.Errorf("certain storageNodes are unhealthy when fetching labels: %s", errors[0]) + isPartialResult = true + } + + // Deduplicate labels + labels = deduplicateStrings(labels) // Substitute "" with "__name__" for i := range labels { @@ -378,101 +458,217 @@ func GetLabels(deadline Deadline) ([]string, error) { // Sort labels like Prometheus does sort.Strings(labels) - return labels, nil + return labels, isPartialResult, nil } // GetLabelValues returns label values for the given labelName // until the given deadline. -func GetLabelValues(labelName string, deadline Deadline) ([]string, error) { +func GetLabelValues(at *auth.Token, labelName string, deadline Deadline) ([]string, bool, error) { if labelName == "__name__" { labelName = "" } - // Search for tag values - labelValues, err := vmstorage.SearchTagValues([]byte(labelName), *maxTagValuesPerSearch) - if err != nil { - return nil, fmt.Errorf("error during label values search for labelName=%q: %s", labelName, err) + // Send the query to all the storage nodes in parallel. + type nodeResult struct { + labelValues []string + err error } + resultsCh := make(chan nodeResult, len(storageNodes)) + for _, sn := range storageNodes { + go func(sn *storageNode) { + sn.labelValuesRequests.Inc() + labelValues, err := sn.getLabelValues(at.AccountID, at.ProjectID, labelName, deadline) + if err != nil { + sn.labelValuesRequestErrors.Inc() + err = fmt.Errorf("cannot get label values from vmstorage %s: %s", sn.connPool.Addr(), err) + } + resultsCh <- nodeResult{ + labelValues: labelValues, + err: err, + } + }(sn) + } + + // Collect results + var labelValues []string + var errors []error + for i := 0; i < len(storageNodes); i++ { + // There is no need in timer here, since all the goroutines executing + // sn.getLabelValues must be finished until the deadline. + nr := <-resultsCh + if nr.err != nil { + errors = append(errors, nr.err) + continue + } + labelValues = append(labelValues, nr.labelValues...) + } + isPartialResult := false + if len(errors) > 0 { + if len(labelValues) == 0 { + // Return only the first error, since it has no sense in returning all errors. 
+ return nil, true, fmt.Errorf("error occured during fetching labels: %s", errors[0]) + } + + // Just log errors and return partial results. + // This allows gracefully degrade vmselect in the case + // if certain storageNodes are temporarily unavailable. + partialLabelValuesResults.Inc() + // Log only the first error, since it has no sense in returning all errors. + logger.Errorf("certain storageNodes are unhealthy when fetching labels: %s", errors[0]) + isPartialResult = true + } + + // Deduplicate labels + labelValues = deduplicateStrings(labelValues) // Sort labelValues like Prometheus does sort.Strings(labelValues) - return labelValues, nil + return labelValues, isPartialResult, nil } -// GetSeriesCount returns the number of unique series. -func GetSeriesCount(deadline Deadline) (uint64, error) { - n, err := vmstorage.GetSeriesCount() - if err != nil { - return 0, fmt.Errorf("error during series count request: %s", err) +func deduplicateStrings(a []string) []string { + m := make(map[string]bool, len(a)) + for _, s := range a { + m[s] = true } - return n, nil -} - -func getStorageSearch() *storage.Search { - v := ssPool.Get() - if v == nil { - return &storage.Search{} + a = a[:0] + for s := range m { + a = append(a, s) } - return v.(*storage.Search) + return a } -func putStorageSearch(sr *storage.Search) { - n := atomic.LoadUint64(&sr.MissingMetricNamesForMetricID) - missingMetricNamesForMetricID.Add(int(n)) - sr.MustClose() - ssPool.Put(sr) +// GetSeriesCount returns the number of unique series for the given at. +func GetSeriesCount(at *auth.Token, deadline Deadline) (uint64, bool, error) { + // Send the query to all the storage nodes in parallel. + type nodeResult struct { + n uint64 + err error + } + resultsCh := make(chan nodeResult, len(storageNodes)) + for _, sn := range storageNodes { + go func(sn *storageNode) { + sn.seriesCountRequests.Inc() + n, err := sn.getSeriesCount(at.AccountID, at.ProjectID, deadline) + if err != nil { + sn.seriesCountRequestErrors.Inc() + err = fmt.Errorf("cannot get series count from vmstorage %s: %s", sn.connPool.Addr(), err) + } + resultsCh <- nodeResult{ + n: n, + err: err, + } + }(sn) + } + + // Collect results + var n uint64 + var errors []error + for i := 0; i < len(storageNodes); i++ { + // There is no need in timer here, since all the goroutines executing + // sn.getSeriesCount must be finished until the deadline. + nr := <-resultsCh + if nr.err != nil { + errors = append(errors, nr.err) + continue + } + n += nr.n + } + isPartialResult := false + if len(errors) > 0 { + if n == 0 { + // Return only the first error, since it has no sense in returning all errors. + return 0, true, fmt.Errorf("error occured during fetching series count: %s", errors[0]) + } + + // Just log errors and return partial results. + // This allows gracefully degrade vmselect in the case + // if certain storageNodes are temporarily unavailable. + partialSeriesCountResults.Inc() + // Log only the first error, since it has no sense in returning all errors. + logger.Errorf("certain storageNodes are unhealthy when fetching series count: %s", errors[0]) + isPartialResult = true + } + + return n, isPartialResult, nil } -var ssPool sync.Pool - -var missingMetricNamesForMetricID = metrics.NewCounter(`vm_missing_metric_names_for_metric_id_total`) - // ProcessSearchQuery performs sq on storage nodes until the given deadline. -func ProcessSearchQuery(sq *storage.SearchQuery, deadline Deadline) (*Results, error) { - // Setup search. 
- tfss, err := setupTfss(sq.TagFilterss) - if err != nil { - return nil, err +func ProcessSearchQuery(at *auth.Token, sq *storage.SearchQuery, deadline Deadline) (*Results, bool, error) { + requestData := sq.Marshal(nil) + + // Send the query to all the storage nodes in parallel. + type nodeResult struct { + results []*storage.MetricBlock + err error } + resultsCh := make(chan nodeResult, len(storageNodes)) tr := storage.TimeRange{ MinTimestamp: sq.MinTimestamp, MaxTimestamp: sq.MaxTimestamp, } + for _, sn := range storageNodes { + go func(sn *storageNode) { + sn.searchRequests.Inc() + results, err := sn.processSearchQuery(requestData, tr, deadline) + if err != nil { + sn.searchRequestErrors.Inc() + err = fmt.Errorf("cannot perform search on vmstorage %s: %s", sn.connPool.Addr(), err) + } + resultsCh <- nodeResult{ + results: results, + err: err, + } + }(sn) + } - vmstorage.WG.Add(1) - defer vmstorage.WG.Done() - - sr := getStorageSearch() - defer putStorageSearch(sr) - sr.Init(vmstorage.Storage, tfss, tr, *maxMetricsPerSearch) - + // Collect results. + var errors []error tbf := getTmpBlocksFile() m := make(map[string][]tmpBlockAddr) - for sr.NextMetricBlock() { - addr, err := tbf.WriteBlock(sr.MetricBlock.Block) - if err != nil { - putTmpBlocksFile(tbf) - return nil, fmt.Errorf("cannot write data to temporary blocks file: %s", err) + for i := 0; i < len(storageNodes); i++ { + // There is no need in timer here, since all the goroutines executing + // sn.processSearchQuery must be finished until the deadline. + nr := <-resultsCh + if nr.err != nil { + errors = append(errors, nr.err) + continue } - if time.Until(deadline.Deadline) < 0 { - putTmpBlocksFile(tbf) - return nil, fmt.Errorf("timeout exceeded while fetching data from storage: %s", deadline.Timeout) + for _, mb := range nr.results { + addr, err := tbf.WriteBlock(mb.Block) + if err != nil { + errors = append(errors, fmt.Errorf("cannot write data to temporary blocks file: %s", err)) + break + } + metricName := mb.MetricName + m[string(metricName)] = append(m[string(metricName)], addr) } - metricName := sr.MetricBlock.MetricName - m[string(metricName)] = append(m[string(metricName)], addr) } - if err := sr.Error(); err != nil { - putTmpBlocksFile(tbf) - return nil, fmt.Errorf("search error: %s", err) + isPartialResult := false + if len(errors) > 0 { + if len(m) == 0 { + // Return only the first error, since it has no sense in returning all errors. + putTmpBlocksFile(tbf) + return nil, true, fmt.Errorf("error occured during search: %s", errors[0]) + } + + // Just log errors and return partial results. + // This allows gracefully degrade vmselect in the case + // if certain storageNodes are temporarily unavailable. + partialSearchResults.Inc() + // Log only the first error, since it has no sense in returning all errors. 
+ logger.Errorf("certain storageNodes are unhealthy during search: %s", errors[0]) + isPartialResult = true } if err := tbf.Finalize(); err != nil { putTmpBlocksFile(tbf) - return nil, fmt.Errorf("cannot finalize temporary blocks file: %s", err) + return nil, false, fmt.Errorf("cannot finalize temporary blocks file: %s", err) } var rss Results rss.packedTimeseries = make([]packedTimeseries, len(m)) + rss.at = at rss.tr = tr rss.deadline = deadline rss.tbf = tbf @@ -483,9 +679,491 @@ func ProcessSearchQuery(sq *storage.SearchQuery, deadline Deadline) (*Results, e pts.metricName = metricName pts.addrs = addrs } - return &rss, nil + + return &rss, isPartialResult, nil } +type storageNode struct { + connPool *netutil.ConnPool + + // The channel for limiting the maximum number of concurrent queries to storageNode. + concurrentQueriesCh chan struct{} + + // The number of DeleteSeries requests to storageNode. + deleteSeriesRequests *metrics.Counter + + // The number of DeleteSeries request errors to storageNode. + deleteSeriesRequestErrors *metrics.Counter + + // The number of requests to labels. + labelsRequests *metrics.Counter + + // The number of errors during requests to labels. + labelsRequestErrors *metrics.Counter + + // The number of requests to labelValues. + labelValuesRequests *metrics.Counter + + // The number of errors during requests to labelValues. + labelValuesRequestErrors *metrics.Counter + + // The number of requests to seriesCount. + seriesCountRequests *metrics.Counter + + // The number of errors during requests to seriesCount. + seriesCountRequestErrors *metrics.Counter + + // The number of search requests to storageNode. + searchRequests *metrics.Counter + + // The number of search request errors to storageNode. + searchRequestErrors *metrics.Counter + + // The number of metric blocks read. + metricBlocksRead *metrics.Counter + + // The number of read metric rows. + metricRowsRead *metrics.Counter +} + +func (sn *storageNode) deleteMetrics(requestData []byte, deadline Deadline) (int, error) { + var deletedCount int + f := func(bc *handshake.BufferedConn) error { + n, err := sn.deleteMetricsOnConn(bc, requestData) + if err != nil { + return err + } + deletedCount += n + return nil + } + if err := sn.execOnConn("deleteMetrics_v2", f, deadline); err != nil { + // Try again before giving up. + // There is no need in zeroing deletedCount. + if err = sn.execOnConn("deleteMetrics_v2", f, deadline); err != nil { + return deletedCount, err + } + } + return deletedCount, nil +} + +func (sn *storageNode) getLabels(accountID, projectID uint32, deadline Deadline) ([]string, error) { + var labels []string + f := func(bc *handshake.BufferedConn) error { + ls, err := sn.getLabelsOnConn(bc, accountID, projectID) + if err != nil { + return err + } + labels = ls + return nil + } + if err := sn.execOnConn("labels", f, deadline); err != nil { + // Try again before giving up. + labels = nil + if err = sn.execOnConn("labels", f, deadline); err != nil { + return nil, err + } + } + return labels, nil +} + +func (sn *storageNode) getLabelValues(accountID, projectID uint32, labelName string, deadline Deadline) ([]string, error) { + var labelValues []string + f := func(bc *handshake.BufferedConn) error { + lvs, err := sn.getLabelValuesOnConn(bc, accountID, projectID, labelName) + if err != nil { + return err + } + labelValues = lvs + return nil + } + if err := sn.execOnConn("labelValues", f, deadline); err != nil { + // Try again before giving up. 
+ labelValues = nil + if err = sn.execOnConn("labelValues", f, deadline); err != nil { + return nil, err + } + } + return labelValues, nil +} + +func (sn *storageNode) getSeriesCount(accountID, projectID uint32, deadline Deadline) (uint64, error) { + var n uint64 + f := func(bc *handshake.BufferedConn) error { + nn, err := sn.getSeriesCountOnConn(bc, accountID, projectID) + if err != nil { + return err + } + n = nn + return nil + } + if err := sn.execOnConn("seriesCount", f, deadline); err != nil { + // Try again before giving up. + n = 0 + if err = sn.execOnConn("seriesCount", f, deadline); err != nil { + return 0, err + } + } + return n, nil +} + +func (sn *storageNode) processSearchQuery(requestData []byte, tr storage.TimeRange, deadline Deadline) ([]*storage.MetricBlock, error) { + var results []*storage.MetricBlock + f := func(bc *handshake.BufferedConn) error { + rs, err := sn.processSearchQueryOnConn(bc, requestData, tr) + if err != nil { + return err + } + results = rs + return nil + } + if err := sn.execOnConn("search_v2", f, deadline); err != nil { + // Try again before giving up. + results = nil + if err = sn.execOnConn("search_v2", f, deadline); err != nil { + return nil, err + } + } + return results, nil +} + +func (sn *storageNode) execOnConn(rpcName string, f func(bc *handshake.BufferedConn) error, deadline Deadline) error { + select { + case sn.concurrentQueriesCh <- struct{}{}: + default: + return fmt.Errorf("too many concurrent queries (more than %d)", cap(sn.concurrentQueriesCh)) + } + defer func() { + <-sn.concurrentQueriesCh + }() + + bc, err := sn.connPool.Get() + if err != nil { + return fmt.Errorf("cannot obtain connection from a pool: %s", err) + } + if err := bc.SetDeadline(deadline.Deadline); err != nil { + _ = bc.Close() + logger.Panicf("FATAL: cannot set connection deadline: %s", err) + } + if err := writeBytes(bc, []byte(rpcName)); err != nil { + // Close the connection instead of returning it to the pool, + // since it may be broken. + _ = bc.Close() + return fmt.Errorf("cannot send rpcName=%q to the server: %s", rpcName, err) + } + + if err := f(bc); err != nil { + remoteAddr := bc.RemoteAddr() + if _, ok := err.(*errRemote); ok { + // Remote error. The connection may be re-used. Return it to the pool. + sn.connPool.Put(bc) + } else { + // Local error. + // Close the connection instead of returning it to the pool, + // since it may be broken. + _ = bc.Close() + } + return fmt.Errorf("cannot execute rpcName=%q on vmstorage %q with timeout %s: %s", rpcName, remoteAddr, deadline.Timeout, err) + } + // Return the connection back to the pool, assuming it is healthy. + sn.connPool.Put(bc) + return nil +} + +type errRemote struct { + msg string +} + +func (er *errRemote) Error() string { + return er.msg +} + +func (sn *storageNode) deleteMetricsOnConn(bc *handshake.BufferedConn, requestData []byte) (int, error) { + // Send the request to sn + if err := writeBytes(bc, requestData); err != nil { + return 0, fmt.Errorf("cannot send deleteMetrics request to conn: %s", err) + } + if err := bc.Flush(); err != nil { + return 0, fmt.Errorf("cannot flush deleteMetrics request to conn: %s", err) + } + + // Read response error. 
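+	// An empty error message means the request succeeded on vmstorage;
+	// a non-empty message is returned to the caller as errRemote.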
+ buf, err := readBytes(nil, bc, maxErrorMessageSize) + if err != nil { + return 0, fmt.Errorf("cannot read error message: %s", err) + } + if len(buf) > 0 { + return 0, &errRemote{msg: string(buf)} + } + + // Read deletedCount + deletedCount, err := readUint64(bc) + if err != nil { + return 0, fmt.Errorf("cannot read deletedCount value: %s", err) + } + return int(deletedCount), nil +} + +const maxLabelsSize = 16 * 1024 * 1024 + +func (sn *storageNode) getLabelsOnConn(bc *handshake.BufferedConn, accountID, projectID uint32) ([]string, error) { + // Send the request to sn. + if err := writeUint32(bc, accountID); err != nil { + return nil, fmt.Errorf("cannot send accountID=%d to conn: %s", accountID, err) + } + if err := writeUint32(bc, projectID); err != nil { + return nil, fmt.Errorf("cannot send projectID=%d to conn: %s", projectID, err) + } + if err := bc.Flush(); err != nil { + return nil, fmt.Errorf("cannot flush request to conn: %s", err) + } + + // Read response error. + buf, err := readBytes(nil, bc, maxErrorMessageSize) + if err != nil { + return nil, fmt.Errorf("cannot read error message: %s", err) + } + if len(buf) > 0 { + return nil, &errRemote{msg: string(buf)} + } + + // Read response + var labels []string + for { + buf, err = readBytes(buf[:0], bc, maxLabelsSize) + if err != nil { + return nil, fmt.Errorf("cannot read labels: %s", err) + } + if len(buf) == 0 { + // Reached the end of the response + return labels, nil + } + labels = append(labels, string(buf)) + } +} + +const maxLabelValueSize = 16 * 1024 * 1024 + +func (sn *storageNode) getLabelValuesOnConn(bc *handshake.BufferedConn, accountID, projectID uint32, labelName string) ([]string, error) { + // Send the request to sn. + if err := writeUint32(bc, accountID); err != nil { + return nil, fmt.Errorf("cannot send accountID=%d to conn: %s", accountID, err) + } + if err := writeUint32(bc, projectID); err != nil { + return nil, fmt.Errorf("cannot send projectID=%d to conn: %s", projectID, err) + } + if err := writeBytes(bc, []byte(labelName)); err != nil { + return nil, fmt.Errorf("cannot send labelName=%q to conn: %s", labelName, err) + } + if err := bc.Flush(); err != nil { + return nil, fmt.Errorf("cannot flush labelName to conn: %s", err) + } + + // Read response error. + buf, err := readBytes(nil, bc, maxErrorMessageSize) + if err != nil { + return nil, fmt.Errorf("cannot read error message: %s", err) + } + if len(buf) > 0 { + return nil, &errRemote{msg: string(buf)} + } + + // Read response + var labelValues []string + for { + buf, err = readBytes(buf[:0], bc, maxLabelValueSize) + if err != nil { + return nil, fmt.Errorf("cannot read labelValue: %s", err) + } + if len(buf) == 0 { + // Reached the end of the response + return labelValues, nil + } + labelValues = append(labelValues, string(buf)) + } +} + +func (sn *storageNode) getSeriesCountOnConn(bc *handshake.BufferedConn, accountID, projectID uint32) (uint64, error) { + // Send the request to sn. + if err := writeUint32(bc, accountID); err != nil { + return 0, fmt.Errorf("cannot send accountID=%d to conn: %s", accountID, err) + } + if err := writeUint32(bc, projectID); err != nil { + return 0, fmt.Errorf("cannot send projectID=%d to conn: %s", projectID, err) + } + if err := bc.Flush(); err != nil { + return 0, fmt.Errorf("cannot flush labelName to conn: %s", err) + } + + // Read response error. 
+ buf, err := readBytes(nil, bc, maxErrorMessageSize) + if err != nil { + return 0, fmt.Errorf("cannot read error message: %s", err) + } + if len(buf) > 0 { + return 0, &errRemote{msg: string(buf)} + } + + // Read response + n, err := readUint64(bc) + if err != nil { + return 0, fmt.Errorf("cannot read series count: %s", err) + } + return n, nil +} + +// maxMetricBlockSize is the maximum size of serialized MetricBlock. +const maxMetricBlockSize = 1024 * 1024 + +// maxErrorMessageSize is the maximum size of error message received +// from vmstorage. +const maxErrorMessageSize = 64 * 1024 + +func (sn *storageNode) processSearchQueryOnConn(bc *handshake.BufferedConn, requestData []byte, tr storage.TimeRange) ([]*storage.MetricBlock, error) { + // Send the request to sn. + if err := writeBytes(bc, requestData); err != nil { + return nil, fmt.Errorf("cannot write requestData: %s", err) + } + if err := bc.Flush(); err != nil { + return nil, fmt.Errorf("cannot flush requestData to conn: %s", err) + } + + var err error + var buf []byte + + // Read response error. + buf, err = readBytes(buf[:0], bc, maxErrorMessageSize) + if err != nil { + return nil, fmt.Errorf("cannot read error message: %s", err) + } + if len(buf) > 0 { + return nil, &errRemote{msg: string(buf)} + } + + // Read response. It may consist of multiple MetricBlocks. + var results []*storage.MetricBlock + metricBlocksRead := 0 + for { + buf, err = readBytes(buf[:0], bc, maxMetricBlockSize) + if err != nil { + return nil, fmt.Errorf("cannot read MetricBlock #%d: %s", metricBlocksRead, err) + } + if len(buf) == 0 { + // Reached the end of the response + return results, nil + } + var mb storage.MetricBlock + mb.Block = &storage.Block{} + tail, err := mb.Unmarshal(buf) + if err != nil { + return nil, fmt.Errorf("cannot unmarshal MetricBlock: %s", err) + } + if len(tail) != 0 { + return nil, fmt.Errorf("non-empty tail after unmarshaling MetricBlock: (len=%d) %q", len(tail), tail) + } + metricBlocksRead++ + sn.metricBlocksRead.Inc() + sn.metricRowsRead.Add(mb.Block.RowsCount()) + results = append(results, &mb) + } +} + +func writeBytes(bc *handshake.BufferedConn, buf []byte) error { + sizeBuf := encoding.MarshalUint64(nil, uint64(len(buf))) + if _, err := bc.Write(sizeBuf); err != nil { + return err + } + if _, err := bc.Write(buf); err != nil { + return err + } + return nil +} + +func writeUint32(bc *handshake.BufferedConn, n uint32) error { + buf := encoding.MarshalUint32(nil, n) + if _, err := bc.Write(buf); err != nil { + return err + } + return nil +} + +func readBytes(buf []byte, bc *handshake.BufferedConn, maxDataSize int) ([]byte, error) { + buf = bytesutil.Resize(buf, 8) + if _, err := io.ReadFull(bc, buf); err != nil { + return buf, fmt.Errorf("error read data size: %s", err) + } + dataSize := encoding.UnmarshalUint64(buf) + if dataSize > uint64(maxDataSize) { + return buf, fmt.Errorf("too big data size: %d; it mustn't exceed %d bytes", dataSize, maxDataSize) + } + buf = bytesutil.Resize(buf, int(dataSize)) + if dataSize == 0 { + return buf, nil + } + if _, err := io.ReadFull(bc, buf); err != nil { + return buf, fmt.Errorf("cannot read data with size %d: %s", dataSize, err) + } + return buf, nil +} + +func readUint64(bc *handshake.BufferedConn) (uint64, error) { + var buf [8]byte + if _, err := io.ReadFull(bc, buf[:]); err != nil { + return 0, fmt.Errorf("cannot read uint64: %s", err) + } + n := encoding.UnmarshalUint64(buf[:]) + return n, nil +} + +var storageNodes []*storageNode + +// InitStorageNodes initializes storage nodes' 
connections to the given addrs. +func InitStorageNodes(addrs []string) { + if len(addrs) == 0 { + logger.Panicf("BUG: addrs must be non-empty") + } + + for _, addr := range addrs { + sn := &storageNode{ + // There is no need in requests compression, since they are usually very small. + connPool: netutil.NewConnPool("vmselect", addr, handshake.VMSelectClient, 0), + + concurrentQueriesCh: make(chan struct{}, maxConcurrentQueriesPerStorageNode), + + deleteSeriesRequests: metrics.NewCounter(fmt.Sprintf(`vm_requests_total{action="deleteSeries", type="rpcClient", name="vmselect", addr=%q}`, addr)), + deleteSeriesRequestErrors: metrics.NewCounter(fmt.Sprintf(`vm_request_errors_total{action="deleteSeries", type="rpcClient", name="vmselect", addr=%q}`, addr)), + labelsRequests: metrics.NewCounter(fmt.Sprintf(`vm_requests_total{action="labels", type="rpcClient", name="vmselect", addr=%q}`, addr)), + labelsRequestErrors: metrics.NewCounter(fmt.Sprintf(`vm_request_errors_total{action="labels", type="rpcClient", name="vmselect", addr=%q}`, addr)), + labelValuesRequests: metrics.NewCounter(fmt.Sprintf(`vm_requests_total{action="labelValues", type="rpcClient", name="vmselect", addr=%q}`, addr)), + labelValuesRequestErrors: metrics.NewCounter(fmt.Sprintf(`vm_request_errors_total{action="labelValues", type="rpcClient", name="vmselect", addr=%q}`, addr)), + seriesCountRequests: metrics.NewCounter(fmt.Sprintf(`vm_requests_total{action="seriesCount", type="rpcClient", name="vmselect", addr=%q}`, addr)), + seriesCountRequestErrors: metrics.NewCounter(fmt.Sprintf(`vm_request_errors_total{action="seriesCount", type="rpcClient", name="vmselect", addr=%q}`, addr)), + searchRequests: metrics.NewCounter(fmt.Sprintf(`vm_requests_total{action="search", type="rpcClient", name="vmselect", addr=%q}`, addr)), + searchRequestErrors: metrics.NewCounter(fmt.Sprintf(`vm_request_errors_total{action="search", type="rpcClient", name="vmselect", addr=%q}`, addr)), + metricBlocksRead: metrics.NewCounter(fmt.Sprintf(`vm_metric_blocks_read_total{name="vmselect", addr=%q}`, addr)), + metricRowsRead: metrics.NewCounter(fmt.Sprintf(`vm_metric_rows_read_total{name="vmselect", addr=%q}`, addr)), + } + metrics.NewGauge(fmt.Sprintf(`vm_concurrent_queries{name="vmselect", addr=%q}`, addr), func() float64 { + return float64(len(sn.concurrentQueriesCh)) + }) + storageNodes = append(storageNodes, sn) + } +} + +// Stop gracefully stops netstorage. +func Stop() { + // Nothing to do at the moment. +} + +var ( + partialLabelsResults = metrics.NewCounter(`vm_partial_labels_results_total{name="vmselect"}`) + partialLabelValuesResults = metrics.NewCounter(`vm_partial_label_values_results_total{name="vmselect"}`) + partialSeriesCountResults = metrics.NewCounter(`vm_partial_series_count_results_total{name="vmselect"}`) + partialSearchResults = metrics.NewCounter(`vm_partial_search_results_total{name="vmselect"}`) +) + +// The maximum number of concurrent queries per storageNode. 
+const maxConcurrentQueriesPerStorageNode = 100 + func getResult() *Result { v := rsPool.Get() if v == nil { @@ -505,21 +1183,6 @@ func putResult(rs *Result) { var rsPool sync.Pool -func setupTfss(tagFilterss [][]storage.TagFilter) ([]*storage.TagFilters, error) { - tfss := make([]*storage.TagFilters, 0, len(tagFilterss)) - for _, tagFilters := range tagFilterss { - tfs := storage.NewTagFilters() - for i := range tagFilters { - tf := &tagFilters[i] - if err := tfs.Add(tf.Key, tf.Value, tf.IsNegative, tf.IsRegexp); err != nil { - return nil, fmt.Errorf("cannot parse tag filter %s: %s", tf, err) - } - } - tfss = append(tfss, tfs) - } - return tfss, nil -} - // Deadline contains deadline with the corresponding timeout for pretty error messages. type Deadline struct { Deadline time.Time diff --git a/app/vmselect/prometheus/prometheus.go b/app/vmselect/prometheus/prometheus.go index 14b581ffc..f16f3579d 100644 --- a/app/vmselect/prometheus/prometheus.go +++ b/app/vmselect/prometheus/prometheus.go @@ -12,6 +12,9 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/netstorage" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/auth" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/storage" "github.com/VictoriaMetrics/metrics" "github.com/valyala/quicktemplate" @@ -20,8 +23,14 @@ import ( var ( maxQueryDuration = flag.Duration("search.maxQueryDuration", time.Second*30, "The maximum time for search query execution") maxQueryLen = flag.Int("search.maxQueryLen", 16*1024, "The maximum search query length in bytes") + + selectNodes flagutil.Array ) +func init() { + flag.Var(&selectNodes, "selectNode", "vmselect address, usage -selectNode=vmselect-host1:8481 -selectNode=vmselect-host2:8481") +} + // Default step used if not set. const defaultStep = 5 * 60 * 1000 @@ -30,7 +39,7 @@ const defaultStep = 5 * 60 * 1000 const latencyOffset = 60 * 1000 // FederateHandler implements /federate . See https://prometheus.io/docs/prometheus/latest/federation/ -func FederateHandler(w http.ResponseWriter, r *http.Request) error { +func FederateHandler(at *auth.Token, w http.ResponseWriter, r *http.Request) error { startTime := time.Now() ct := currentTime() if err := r.ParseForm(); err != nil { @@ -49,11 +58,13 @@ func FederateHandler(w http.ResponseWriter, r *http.Request) error { return err } sq := &storage.SearchQuery{ + AccountID: at.AccountID, + ProjectID: at.ProjectID, MinTimestamp: start, MaxTimestamp: end, TagFilterss: tagFilterss, } - rss, err := netstorage.ProcessSearchQuery(sq, deadline) + rss, _, err := netstorage.ProcessSearchQuery(at, sq, deadline) if err != nil { return fmt.Errorf("cannot fetch data for %q: %s", sq, err) } @@ -87,7 +98,7 @@ func FederateHandler(w http.ResponseWriter, r *http.Request) error { var federateDuration = metrics.NewSummary(`vm_request_duration_seconds{path="/federate"}`) // ExportHandler exports data in raw format from /api/v1/export. 
-func ExportHandler(w http.ResponseWriter, r *http.Request) error { +func ExportHandler(at *auth.Token, w http.ResponseWriter, r *http.Request) error { startTime := time.Now() ct := currentTime() if err := r.ParseForm(); err != nil { @@ -106,7 +117,7 @@ func ExportHandler(w http.ResponseWriter, r *http.Request) error { if start >= end { start = end - defaultStep } - if err := exportHandler(w, matches, start, end, format, deadline); err != nil { + if err := exportHandler(at, w, matches, start, end, format, deadline); err != nil { return err } exportDuration.UpdateDuration(startTime) @@ -115,7 +126,7 @@ func ExportHandler(w http.ResponseWriter, r *http.Request) error { var exportDuration = metrics.NewSummary(`vm_request_duration_seconds{path="/api/v1/export"}`) -func exportHandler(w http.ResponseWriter, matches []string, start, end int64, format string, deadline netstorage.Deadline) error { +func exportHandler(at *auth.Token, w http.ResponseWriter, matches []string, start, end int64, format string, deadline netstorage.Deadline) error { writeResponseFunc := WriteExportStdResponse writeLineFunc := WriteExportJSONLine contentType := "application/json" @@ -132,14 +143,20 @@ func exportHandler(w http.ResponseWriter, matches []string, start, end int64, fo return err } sq := &storage.SearchQuery{ + AccountID: at.AccountID, + ProjectID: at.ProjectID, MinTimestamp: start, MaxTimestamp: end, TagFilterss: tagFilterss, } - rss, err := netstorage.ProcessSearchQuery(sq, deadline) + rss, isPartial, err := netstorage.ProcessSearchQuery(at, sq, deadline) if err != nil { return fmt.Errorf("cannot fetch data for %q: %s", sq, err) } + if isPartial { + rss.Cancel() + return fmt.Errorf("some of the storage nodes are unavailable at the moment") + } resultsCh := make(chan *quicktemplate.ByteBuffer, runtime.GOMAXPROCS(-1)) doneCh := make(chan error) @@ -166,7 +183,7 @@ func exportHandler(w http.ResponseWriter, matches []string, start, end int64, fo // DeleteHandler processes /api/v1/admin/tsdb/delete_series prometheus API request. // // See https://prometheus.io/docs/prometheus/latest/querying/api/#delete-series -func DeleteHandler(r *http.Request) error { +func DeleteHandler(at *auth.Token, r *http.Request) error { startTime := time.Now() if err := r.ParseForm(); err != nil { return fmt.Errorf("cannot parse request form values: %s", err) @@ -175,19 +192,25 @@ func DeleteHandler(r *http.Request) error { return fmt.Errorf("start and end aren't supported. Remove these args from the query in order to delete all the matching metrics") } matches := r.Form["match[]"] + deadline := getDeadline(r) tagFilterss, err := getTagFilterssFromMatches(matches) if err != nil { return err } sq := &storage.SearchQuery{ + AccountID: at.AccountID, + ProjectID: at.ProjectID, TagFilterss: tagFilterss, } - deletedCount, err := netstorage.DeleteSeries(sq) + deletedCount, err := netstorage.DeleteSeries(at, sq, deadline) if err != nil { return fmt.Errorf("cannot delete time series matching %q: %s", matches, err) } if deletedCount > 0 { - promql.ResetRollupResultCache() + // Reset rollup result cache on all the vmselect nodes, + // since the cache may contain deleted data. 
+ // TODO: reset only cache for (account, project) + resetRollupResultCaches() } deleteDuration.UpdateDuration(startTime) return nil @@ -195,13 +218,45 @@ func DeleteHandler(r *http.Request) error { var deleteDuration = metrics.NewSummary(`vm_request_duration_seconds{path="/api/v1/admin/tsdb/delete_series"}`) +func resetRollupResultCaches() { + if len(selectNodes) == 0 { + logger.Panicf("BUG: missing -selectNode flag") + } + for _, selectNode := range selectNodes { + callURL := fmt.Sprintf("http://%s/internal/resetRollupResultCache", selectNode) + resp, err := httpClient.Get(callURL) + if err != nil { + logger.Errorf("error when accessing %q: %s", callURL, err) + resetRollupResultCacheErrors.Inc() + continue + } + if resp.StatusCode != http.StatusOK { + _ = resp.Body.Close() + logger.Errorf("unexpected status code at %q; got %d; want %d", callURL, resp.StatusCode, http.StatusOK) + resetRollupResultCacheErrors.Inc() + continue + } + _ = resp.Body.Close() + } + resetRollupResultCacheCalls.Inc() +} + +var ( + resetRollupResultCacheErrors = metrics.NewCounter("vm_reset_rollup_result_cache_errors_total") + resetRollupResultCacheCalls = metrics.NewCounter("vm_reset_rollup_result_cache_calls_total") +) + +var httpClient = &http.Client{ + Timeout: time.Second * 5, +} + // LabelValuesHandler processes /api/v1/label//values request. // // See https://prometheus.io/docs/prometheus/latest/querying/api/#querying-label-values -func LabelValuesHandler(labelName string, w http.ResponseWriter, r *http.Request) error { +func LabelValuesHandler(at *auth.Token, labelName string, w http.ResponseWriter, r *http.Request) error { startTime := time.Now() deadline := getDeadline(r) - labelValues, err := netstorage.GetLabelValues(labelName, deadline) + labelValues, _, err := netstorage.GetLabelValues(at, labelName, deadline) if err != nil { return fmt.Errorf(`cannot obtain label values for %q: %s`, labelName, err) } @@ -217,10 +272,10 @@ var labelValuesDuration = metrics.NewSummary(`vm_request_duration_seconds{path=" // LabelsHandler processes /api/v1/labels request. // // See https://prometheus.io/docs/prometheus/latest/querying/api/#getting-label-names -func LabelsHandler(w http.ResponseWriter, r *http.Request) error { +func LabelsHandler(at *auth.Token, w http.ResponseWriter, r *http.Request) error { startTime := time.Now() deadline := getDeadline(r) - labels, err := netstorage.GetLabels(deadline) + labels, _, err := netstorage.GetLabels(at, deadline) if err != nil { return fmt.Errorf("cannot obtain labels: %s", err) } @@ -234,13 +289,14 @@ func LabelsHandler(w http.ResponseWriter, r *http.Request) error { var labelsDuration = metrics.NewSummary(`vm_request_duration_seconds{path="/api/v1/labels"}`) // SeriesCountHandler processes /api/v1/series/count request. -func SeriesCountHandler(w http.ResponseWriter, r *http.Request) error { +func SeriesCountHandler(at *auth.Token, w http.ResponseWriter, r *http.Request) error { startTime := time.Now() deadline := getDeadline(r) - n, err := netstorage.GetSeriesCount(deadline) + n, _, err := netstorage.GetSeriesCount(at, deadline) if err != nil { return fmt.Errorf("cannot obtain series count: %s", err) } + w.Header().Set("Content-Type", "application/json") WriteSeriesCountResponse(w, n) seriesCountDuration.UpdateDuration(startTime) @@ -252,7 +308,7 @@ var seriesCountDuration = metrics.NewSummary(`vm_request_duration_seconds{path=" // SeriesHandler processes /api/v1/series request. 
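`resetRollupResultCaches` above fans a plain GET out to every `-selectNode` address and treats anything other than HTTP 200 as an error. The receiving handler is not part of this hunk, so the sketch below is only a hypothetical shape for it, assuming it drops the local rollup result cache and answers 200:

```go
package main

import "net/http"

// resetRollupResultCache stands in for promql.ResetRollupResultCache:
// it would drop all cached rollup results on this vmselect node.
func resetRollupResultCache() {}

func main() {
	http.HandleFunc("/internal/resetRollupResultCache", func(w http.ResponseWriter, r *http.Request) {
		resetRollupResultCache()
		w.WriteHeader(http.StatusOK) // the caller above only checks for a 200 status
	})
	_ = http.ListenAndServe(":8481", nil) // illustrative vmselect listen address
}
```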
// // See https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers -func SeriesHandler(w http.ResponseWriter, r *http.Request) error { +func SeriesHandler(at *auth.Token, w http.ResponseWriter, r *http.Request) error { startTime := time.Now() ct := currentTime() @@ -272,11 +328,13 @@ func SeriesHandler(w http.ResponseWriter, r *http.Request) error { start = end - defaultStep } sq := &storage.SearchQuery{ + AccountID: at.AccountID, + ProjectID: at.ProjectID, MinTimestamp: start, MaxTimestamp: end, TagFilterss: tagFilterss, } - rss, err := netstorage.ProcessSearchQuery(sq, deadline) + rss, _, err := netstorage.ProcessSearchQuery(at, sq, deadline) if err != nil { return fmt.Errorf("cannot fetch data for %q: %s", sq, err) } @@ -315,7 +373,7 @@ var seriesDuration = metrics.NewSummary(`vm_request_duration_seconds{path="/api/ // QueryHandler processes /api/v1/query request. // // See https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries -func QueryHandler(w http.ResponseWriter, r *http.Request) error { +func QueryHandler(at *auth.Token, w http.ResponseWriter, r *http.Request) error { startTime := time.Now() ct := currentTime() @@ -350,7 +408,7 @@ func QueryHandler(w http.ResponseWriter, r *http.Request) error { start -= offset end := start start = end - window - if err := exportHandler(w, []string{childQuery}, start, end, "promapi", deadline); err != nil { + if err := exportHandler(at, w, []string{childQuery}, start, end, "promapi", deadline); err != nil { return err } queryDuration.UpdateDuration(startTime) @@ -358,10 +416,11 @@ func QueryHandler(w http.ResponseWriter, r *http.Request) error { } ec := promql.EvalConfig{ - Start: start, - End: start, - Step: step, - Deadline: deadline, + AuthToken: at, + Start: start, + End: start, + Step: step, + Deadline: deadline, } result, err := promql.Exec(&ec, query) if err != nil { @@ -379,7 +438,7 @@ var queryDuration = metrics.NewSummary(`vm_request_duration_seconds{path="/api/v // QueryRangeHandler processes /api/v1/query_range request. 
// // See https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries -func QueryRangeHandler(w http.ResponseWriter, r *http.Request) error { +func QueryRangeHandler(at *auth.Token, w http.ResponseWriter, r *http.Request) error { startTime := time.Now() ct := currentTime() @@ -403,11 +462,12 @@ func QueryRangeHandler(w http.ResponseWriter, r *http.Request) error { start, end = promql.AdjustStartEnd(start, end, step) ec := promql.EvalConfig{ - Start: start, - End: end, - Step: step, - Deadline: deadline, - MayCache: mayCache, + AuthToken: at, + Start: start, + End: end, + Step: step, + Deadline: deadline, + MayCache: mayCache, } result, err := promql.Exec(&ec, query) if err != nil { diff --git a/app/vmselect/promql/eval.go b/app/vmselect/promql/eval.go index 7a26d81f1..f0f94b81d 100644 --- a/app/vmselect/promql/eval.go +++ b/app/vmselect/promql/eval.go @@ -8,6 +8,7 @@ import ( "sync" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/netstorage" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/auth" "github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/memory" @@ -62,9 +63,10 @@ func AdjustStartEnd(start, end, step int64) (int64, int64) { // EvalConfig is the configuration required for query evaluation via Exec type EvalConfig struct { - Start int64 - End int64 - Step int64 + AuthToken *auth.Token + Start int64 + End int64 + Step int64 Deadline netstorage.Deadline @@ -77,6 +79,7 @@ type EvalConfig struct { // newEvalConfig returns new EvalConfig copy from src. func newEvalConfig(src *EvalConfig) *EvalConfig { var ec EvalConfig + ec.AuthToken = src.AuthToken ec.Start = src.Start ec.End = src.End ec.Step = src.Step @@ -510,11 +513,14 @@ func evalRollupFuncWithMetricExpr(ec *EvalConfig, name string, rf rollupFunc, me // Fetch the remaining part of the result. 
sq := &storage.SearchQuery{ + AccountID: ec.AuthToken.AccountID, + ProjectID: ec.AuthToken.ProjectID, MinTimestamp: start - window - maxSilenceInterval, MaxTimestamp: ec.End + ec.Step, TagFilterss: [][]storage.TagFilter{me.TagFilters}, } - rss, err := netstorage.ProcessSearchQuery(sq, ec.Deadline) + + rss, denyCache, err := netstorage.ProcessSearchQuery(ec.AuthToken, sq, ec.Deadline) if err != nil { return nil, err } @@ -570,8 +576,9 @@ func evalRollupFuncWithMetricExpr(ec *EvalConfig, name string, rf rollupFunc, me } } tss = mergeTimeseries(tssCached, tss, start, ec) - rollupResultCacheV.Put(name, ec, me, window, tss) - + if !denyCache { + rollupResultCacheV.Put(name, ec, me, window, tss) + } return tss, nil } @@ -628,6 +635,8 @@ var bbPool bytesutil.ByteBufferPool func evalNumber(ec *EvalConfig, n float64) []*timeseries { var ts timeseries ts.denyReuse = true + ts.MetricName.AccountID = ec.AuthToken.AccountID + ts.MetricName.ProjectID = ec.AuthToken.ProjectID timestamps := ec.getSharedTimestamps() values := make([]float64, len(timestamps)) for i := range timestamps { diff --git a/app/vmselect/promql/exec_test.go b/app/vmselect/promql/exec_test.go index e683fcf02..8219c9b36 100644 --- a/app/vmselect/promql/exec_test.go +++ b/app/vmselect/promql/exec_test.go @@ -5,6 +5,7 @@ import ( "time" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/netstorage" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/auth" "github.com/VictoriaMetrics/VictoriaMetrics/lib/storage" ) @@ -47,15 +48,24 @@ func TestExpandWithExprsError(t *testing.T) { } func TestExecSuccess(t *testing.T) { + accountID := uint32(123) + projectID := uint32(567) start := int64(1000e3) end := int64(2000e3) step := int64(200e3) timestampsExpected := []int64{1000e3, 1200e3, 1400e3, 1600e3, 1800e3, 2000e3} - metricNameExpected := storage.MetricName{} + metricNameExpected := storage.MetricName{ + AccountID: accountID, + ProjectID: projectID, + } f := func(q string, resultExpected []netstorage.Result) { t.Helper() ec := &EvalConfig{ + AuthToken: &auth.Token{ + AccountID: accountID, + ProjectID: projectID, + }, Start: start, End: end, Step: step, @@ -3423,6 +3433,10 @@ func TestExecError(t *testing.T) { f := func(q string) { t.Helper() ec := &EvalConfig{ + AuthToken: &auth.Token{ + AccountID: 123, + ProjectID: 567, + }, Start: 1000, End: 2000, Step: 100, @@ -3574,6 +3588,12 @@ func testResultsEqual(t *testing.T, result, resultExpected []netstorage.Result) func testMetricNamesEqual(t *testing.T, mn, mnExpected *storage.MetricName) { t.Helper() + if mn.AccountID != mnExpected.AccountID { + t.Fatalf(`unexpected accountID; got %d; want %d`, mn.AccountID, mnExpected.AccountID) + } + if mn.ProjectID != mnExpected.ProjectID { + t.Fatalf(`unexpected projectID; got %d; want %d`, mn.ProjectID, mnExpected.ProjectID) + } if string(mn.MetricGroup) != string(mnExpected.MetricGroup) { t.Fatalf(`unexpected MetricGroup; got %q; want %q`, mn.MetricGroup, mnExpected.MetricGroup) } diff --git a/app/vmselect/promql/rollup_result_cache.go b/app/vmselect/promql/rollup_result_cache.go index c12c22916..28d5856ed 100644 --- a/app/vmselect/promql/rollup_result_cache.go +++ b/app/vmselect/promql/rollup_result_cache.go @@ -8,6 +8,7 @@ import ( "sync/atomic" "time" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/auth" "github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/memory" @@ -37,6 +38,8 @@ var ( ) // InitRollupResultCache initializes the 
rollupResult cache +// +// if cachePath is empty, then the cache isn't stored to persistent disk. func InitRollupResultCache(cachePath string) { rollupResultCachePath = cachePath startTime := time.Now() @@ -106,6 +109,8 @@ func StopRollupResultCache() { } } +// TODO: convert this cache to distributed cache shared among vmselect +// instances in the cluster. type rollupResultCache struct { c *fastcache.Cache } @@ -127,7 +132,7 @@ func (rrc *rollupResultCache) Get(funcName string, ec *EvalConfig, me *metricExp bb := bbPool.Get() defer bbPool.Put(bb) - bb.B = marshalRollupResultCacheKey(bb.B[:0], funcName, me, window, ec.Step) + bb.B = marshalRollupResultCacheKey(bb.B[:0], funcName, ec.AuthToken, me, window, ec.Step) metainfoBuf := rrc.c.Get(nil, bb.B) if len(metainfoBuf) == 0 { return nil, ec.Start @@ -145,7 +150,7 @@ func (rrc *rollupResultCache) Get(funcName string, ec *EvalConfig, me *metricExp if len(resultBuf) == 0 { mi.RemoveKey(key) metainfoBuf = mi.Marshal(metainfoBuf[:0]) - bb.B = marshalRollupResultCacheKey(bb.B[:0], funcName, me, window, ec.Step) + bb.B = marshalRollupResultCacheKey(bb.B[:0], funcName, ec.AuthToken, me, window, ec.Step) rrc.c.Set(bb.B, metainfoBuf) return nil, ec.Start } @@ -235,7 +240,7 @@ func (rrc *rollupResultCache) Put(funcName string, ec *EvalConfig, me *metricExp bb.B = key.Marshal(bb.B[:0]) rrc.c.SetBig(bb.B, tssMarshaled) - bb.B = marshalRollupResultCacheKey(bb.B[:0], funcName, me, window, ec.Step) + bb.B = marshalRollupResultCacheKey(bb.B[:0], funcName, ec.AuthToken, me, window, ec.Step) metainfoBuf := rrc.c.Get(nil, bb.B) var mi rollupResultCacheMetainfo if len(metainfoBuf) > 0 { @@ -265,8 +270,10 @@ var tooBigRollupResults = metrics.NewCounter("vm_too_big_rollup_results_total") // Increment this value every time the format of the cache changes. const rollupResultCacheVersion = 4 -func marshalRollupResultCacheKey(dst []byte, funcName string, me *metricExpr, window, step int64) []byte { +func marshalRollupResultCacheKey(dst []byte, funcName string, at *auth.Token, me *metricExpr, window, step int64) []byte { dst = append(dst, rollupResultCacheVersion) + dst = encoding.MarshalUint32(dst, at.AccountID) + dst = encoding.MarshalUint32(dst, at.ProjectID) dst = encoding.MarshalUint64(dst, uint64(len(funcName))) dst = append(dst, funcName...) dst = encoding.MarshalInt64(dst, window) diff --git a/app/vmselect/promql/rollup_result_cache_test.go b/app/vmselect/promql/rollup_result_cache_test.go index e2cdfac2e..f86c2a660 100644 --- a/app/vmselect/promql/rollup_result_cache_test.go +++ b/app/vmselect/promql/rollup_result_cache_test.go @@ -3,6 +3,7 @@ package promql import ( "testing" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/auth" "github.com/VictoriaMetrics/VictoriaMetrics/lib/storage" ) @@ -15,6 +16,11 @@ func TestRollupResultCache(t *testing.T) { End: 2000, Step: 200, + AuthToken: &auth.Token{ + AccountID: 333, + ProjectID: 843, + }, + MayCache: true, } me := &metricExpr{ diff --git a/app/vmstorage/Makefile b/app/vmstorage/Makefile new file mode 100644 index 000000000..046f98e5a --- /dev/null +++ b/app/vmstorage/Makefile @@ -0,0 +1,32 @@ +# All these commands must run from repository root. 
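The important change in the rollup result cache hunk above is that `marshalRollupResultCacheKey` now prefixes every cache key with the cache version byte plus the tenant's `AccountID` and `ProjectID`, so different tenants can never hit each other's cached rollups. A self-contained illustration of that prefix, using `encoding/binary` in place of `lib/encoding` (big-endian layout is assumed here for illustration):

```go
package main

import (
	"encoding/binary"
	"fmt"
)

// marshalKeyPrefix mimics the tenant-aware prefix written by
// marshalRollupResultCacheKey: the cache version byte, then AccountID
// and ProjectID as 4-byte big-endian integers.
func marshalKeyPrefix(dst []byte, version byte, accountID, projectID uint32) []byte {
	dst = append(dst, version)
	var b [4]byte
	binary.BigEndian.PutUint32(b[:], accountID)
	dst = append(dst, b[:]...)
	binary.BigEndian.PutUint32(b[:], projectID)
	dst = append(dst, b[:]...)
	return dst
}

func main() {
	a := marshalKeyPrefix(nil, 4, 123, 567) // version 4 matches rollupResultCacheVersion in the diff
	b := marshalKeyPrefix(nil, 4, 123, 568)
	fmt.Printf("tenant A prefix: %x\n", a)
	fmt.Printf("tenant B prefix: %x\n", b) // differs, so the tenants cannot share cache entries
}
```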
+ +run-vmstorage: + mkdir -p vmstorage-data + DOCKER_OPTS='-v $(shell pwd)/vmstorage-data:/vmstorage-data -p 8482:8482 -p 8400:8400 -p 8401:8401' \ + APP_NAME=vmstorage \ + ARGS='-retentionPeriod=12' \ + $(MAKE) run-via-docker + +vmstorage: + APP_NAME=vmstorage $(MAKE) app-local + +vmstorage-race: + APP_NAME=vmstorage RACE=-race $(MAKE) app-local + +vmstorage-prod: + APP_NAME=vmstorage $(MAKE) app-via-docker + +vmstorage-prod-race: + APP_NAME=vmstorage RACE=-race $(MAKE) app-via-docker + +package-vmstorage: + APP_NAME=vmstorage $(MAKE) package-via-docker + +package-vmstorage-race: + APP_NAME=vmstorage RACE=-race $(MAKE) package-via-docker + +publish-vmstorage: + APP_NAME=vmstorage $(MAKE) publish-via-docker + +publish-vmstorage-race: + APP_NAME=vmstorage RACE=-race $(MAKE) publish-via-docker diff --git a/app/vmstorage/README.md b/app/vmstorage/README.md index 6df28ba61..741a669d2 100644 --- a/app/vmstorage/README.md +++ b/app/vmstorage/README.md @@ -1,5 +1,5 @@ `vmstorage` performs the following tasks: -- Accepts inserts from `vminsert` and stores them to local storage. +- Accepts inserts from `vminsert` nodes and stores them to local storage. -- Performs select requests from `vmselect`. +- Performs select requests from `vmselect` nodes. diff --git a/app/vmstorage/deployment/Dockerfile b/app/vmstorage/deployment/Dockerfile new file mode 100644 index 000000000..0341eb7cb --- /dev/null +++ b/app/vmstorage/deployment/Dockerfile @@ -0,0 +1,7 @@ +FROM scratch +COPY --from=local/certs:1.0.2 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt +COPY bin/vmstorage-prod . +EXPOSE 8482 +EXPOSE 8400 +EXPOSE 8401 +ENTRYPOINT ["/vmstorage-prod"] diff --git a/app/vmstorage/main.go b/app/vmstorage/main.go index 1e08b323a..ae612f0e5 100644 --- a/app/vmstorage/main.go +++ b/app/vmstorage/main.go @@ -1,4 +1,4 @@ -package vmstorage +package main import ( "flag" @@ -8,122 +8,84 @@ import ( "sync" "time" - "github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding" + "github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage/transport" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo" "github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/storage" - "github.com/VictoriaMetrics/VictoriaMetrics/lib/syncwg" "github.com/VictoriaMetrics/metrics" ) var ( + httpListenAddr = flag.String("httpListenAddr", ":8482", "Address to listen for http connections") retentionPeriod = flag.Int("retentionPeriod", 1, "Retention period in months") + storageDataPath = flag.String("storageDataPath", "vmstorage-data", "Path to storage data") + vminsertAddr = flag.String("vminsertAddr", ":8400", "TCP address to accept connections from vminsert services") + vmselectAddr = flag.String("vmselectAddr", ":8401", "TCP address to accept connections from vmselect services") snapshotAuthKey = flag.String("snapshotAuthKey", "", "authKey, which must be passed in query string to /snapshot* pages") - - precisionBits = flag.Int("precisionBits", 64, "The number of precision bits to store per each value. Lower precision bits improves data compression at the cost of precision loss") - - // DataPath is a path to storage data. - DataPath = flag.String("storageDataPath", "victoria-metrics-data", "Path to storage data") ) -// Init initializes vmstorage. 
-func Init() { - if err := encoding.CheckPrecisionBits(uint8(*precisionBits)); err != nil { - logger.Fatalf("invalid `-precisionBits`: %s", err) - } - logger.Infof("opening storage at %q with retention period %d months", *DataPath, *retentionPeriod) +func main() { + flag.Parse() + buildinfo.Init() + logger.Init() + + logger.Infof("opening storage at %q with retention period %d months", *storageDataPath, *retentionPeriod) startTime := time.Now() - strg, err := storage.OpenStorage(*DataPath, *retentionPeriod) + strg, err := storage.OpenStorage(*storageDataPath, *retentionPeriod) if err != nil { - logger.Fatalf("cannot open a storage at %s with retention period %d months: %s", *DataPath, *retentionPeriod, err) + logger.Fatalf("cannot open a storage at %s with retention period %d months: %s", *storageDataPath, *retentionPeriod, err) } - Storage = strg var m storage.Metrics - Storage.UpdateMetrics(&m) + strg.UpdateMetrics(&m) tm := &m.TableMetrics partsCount := tm.SmallPartsCount + tm.BigPartsCount blocksCount := tm.SmallBlocksCount + tm.BigBlocksCount rowsCount := tm.SmallRowsCount + tm.BigRowsCount logger.Infof("successfully opened storage %q in %s; partsCount: %d; blocksCount: %d; rowsCount: %d", - *DataPath, time.Since(startTime), partsCount, blocksCount, rowsCount) + *storageDataPath, time.Since(startTime), partsCount, blocksCount, rowsCount) - registerStorageMetrics(Storage) -} + registerStorageMetrics(strg) -// Storage is a storage. -// -// Every storage call must be wrapped into WG.Add(1) ... WG.Done() -// for proper graceful shutdown when Stop is called. -var Storage *storage.Storage + srv, err := transport.NewServer(*vminsertAddr, *vmselectAddr, strg) + if err != nil { + logger.Fatalf("cannot create a server with vminsertAddr=%s, vmselectAddr=%s: %s", *vminsertAddr, *vmselectAddr, err) + } -// WG must be incremented before Storage call. -// -// Use syncwg instead of sync, since Add is called from concurrent goroutines. -var WG syncwg.WaitGroup + go srv.RunVMInsert() + go srv.RunVMSelect() -// AddRows adds mrs to the storage. -func AddRows(mrs []storage.MetricRow) error { - WG.Add(1) - err := Storage.AddRows(mrs, uint8(*precisionBits)) - WG.Done() - return err -} + requestHandler := newRequestHandler(strg) + go func() { + httpserver.Serve(*httpListenAddr, requestHandler) + }() -// DeleteMetrics deletes metrics matching tfss. -// -// Returns the number of deleted metrics. -func DeleteMetrics(tfss []*storage.TagFilters) (int, error) { - WG.Add(1) - n, err := Storage.DeleteMetrics(tfss) - WG.Done() - return n, err -} + sig := procutil.WaitForSigterm() + logger.Infof("service received signal %s", sig) -// SearchTagKeys searches for tag keys -func SearchTagKeys(maxTagKeys int) ([]string, error) { - WG.Add(1) - keys, err := Storage.SearchTagKeys(maxTagKeys) - WG.Done() - return keys, err -} + logger.Infof("gracefully shutting down the service") + startTime = time.Now() + srv.MustClose() + logger.Infof("successfully shut down the service in %s", time.Since(startTime)) -// SearchTagValues searches for tag values for the given tagKey -func SearchTagValues(tagKey []byte, maxTagValues int) ([]string, error) { - WG.Add(1) - values, err := Storage.SearchTagValues(tagKey, maxTagValues) - WG.Done() - return values, err -} - -// GetSeriesCount returns the number of time series in the storage. 
-func GetSeriesCount() (uint64, error) { - WG.Add(1) - n, err := Storage.GetSeriesCount() - WG.Done() - return n, err -} - -// Stop stops the vmstorage -func Stop() { - logger.Infof("gracefully closing the storage at %s", *DataPath) - startTime := time.Now() - WG.WaitAndBlock() - Storage.MustClose() + logger.Infof("gracefully closing the storage at %s", *storageDataPath) + startTime = time.Now() + strg.MustClose() logger.Infof("successfully closed the storage in %s", time.Since(startTime)) - logger.Infof("the storage has been stopped") + logger.Infof("the vmstorage has been stopped") } -// RequestHandler is a storage request handler. -func RequestHandler(w http.ResponseWriter, r *http.Request) bool { - path := r.URL.Path - prometheusCompatibleResponse := false - if path == "/api/v1/admin/tsdb/snapshot" { - // Handle Prometheus API - https://prometheus.io/docs/prometheus/latest/querying/api/#snapshot . - prometheusCompatibleResponse = true - path = "/snapshot/create" +func newRequestHandler(strg *storage.Storage) httpserver.RequestHandler { + return func(w http.ResponseWriter, r *http.Request) bool { + return requestHandler(w, r, strg) } +} + +func requestHandler(w http.ResponseWriter, r *http.Request, strg *storage.Storage) bool { + path := r.URL.Path if !strings.HasPrefix(path, "/snapshot") { return false } @@ -137,22 +99,18 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool { switch path { case "/create": w.Header().Set("Content-Type", "application/json") - snapshotPath, err := Storage.CreateSnapshot() + snapshotPath, err := strg.CreateSnapshot() if err != nil { msg := fmt.Sprintf("cannot create snapshot: %s", err) logger.Errorf("%s", msg) fmt.Fprintf(w, `{"status":"error","msg":%q}`, msg) return true } - if prometheusCompatibleResponse { - fmt.Fprintf(w, `{"status":"success","data":{"name":%q}}`, snapshotPath) - } else { - fmt.Fprintf(w, `{"status":"ok","snapshot":%q}`, snapshotPath) - } + fmt.Fprintf(w, `{"status":"ok","snapshot":%q}`, snapshotPath) return true case "/list": w.Header().Set("Content-Type", "application/json") - snapshots, err := Storage.ListSnapshots() + snapshots, err := strg.ListSnapshots() if err != nil { msg := fmt.Sprintf("cannot list snapshots: %s", err) logger.Errorf("%s", msg) @@ -171,7 +129,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool { case "/delete": w.Header().Set("Content-Type", "application/json") snapshotName := r.FormValue("snapshot") - if err := Storage.DeleteSnapshot(snapshotName); err != nil { + if err := strg.DeleteSnapshot(snapshotName); err != nil { msg := fmt.Sprintf("cannot delete snapshot %q: %s", snapshotName, err) logger.Errorf("%s", msg) fmt.Fprintf(w, `{"status":"error","msg":%q}`, msg) @@ -181,7 +139,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool { return true case "/delete_all": w.Header().Set("Content-Type", "application/json") - snapshots, err := Storage.ListSnapshots() + snapshots, err := strg.ListSnapshots() if err != nil { msg := fmt.Sprintf("cannot list snapshots: %s", err) logger.Errorf("%s", msg) @@ -189,7 +147,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool { return true } for _, snapshotName := range snapshots { - if err := Storage.DeleteSnapshot(snapshotName); err != nil { + if err := strg.DeleteSnapshot(snapshotName); err != nil { msg := fmt.Sprintf("cannot delete snapshot %q: %s", snapshotName, err) logger.Errorf("%s", msg) fmt.Fprintf(w, `{"status":"error","msg":%q}`, msg) diff --git a/app/vmstorage/transport/server.go 
b/app/vmstorage/transport/server.go new file mode 100644 index 000000000..9beefd1e0 --- /dev/null +++ b/app/vmstorage/transport/server.go @@ -0,0 +1,736 @@ +package transport + +import ( + "flag" + "fmt" + "io" + "net" + "sync" + "sync/atomic" + "time" + + "github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/consts" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/handshake" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/storage" + "github.com/VictoriaMetrics/metrics" +) + +var ( + maxTagKeysPerSearch = flag.Int("search.maxTagKeys", 10e3, "The maximum number of tag keys returned per search") + maxTagValuesPerSearch = flag.Int("search.maxTagValues", 10e3, "The maximum number of tag values returned per search") + maxMetricsPerSearch = flag.Int("search.maxUniqueTimeseries", 100e3, "The maximum number of unique time series each search can scan") + + precisionBits = flag.Int("precisionBits", 64, "The number of precision bits to store per each value. Lower precision bits improves data compression at the cost of precision loss") +) + +// Server processes connections from vminsert and vmselect. +type Server struct { + storage *storage.Storage + + vminsertLN net.Listener + vmselectLN net.Listener + + vminsertWG sync.WaitGroup + vmselectWG sync.WaitGroup + + vminsertConnsMap connsMap + vmselectConnsMap connsMap + + stopFlag uint64 +} + +type connsMap struct { + mu sync.Mutex + m map[net.Conn]struct{} +} + +func (cm *connsMap) Init() { + cm.m = make(map[net.Conn]struct{}) +} + +func (cm *connsMap) Add(c net.Conn) { + cm.mu.Lock() + cm.m[c] = struct{}{} + cm.mu.Unlock() +} + +func (cm *connsMap) Delete(c net.Conn) { + cm.mu.Lock() + delete(cm.m, c) + cm.mu.Unlock() +} + +func (cm *connsMap) CloseAll() { + cm.mu.Lock() + for c := range cm.m { + _ = c.Close() + } + cm.mu.Unlock() +} + +// NewServer returns new Server. +func NewServer(vminsertAddr, vmselectAddr string, storage *storage.Storage) (*Server, error) { + vminsertLN, err := netutil.NewTCPListener("vminsert", vminsertAddr) + if err != nil { + return nil, fmt.Errorf("unable to listen vminsertAddr %s: %s", vminsertAddr, err) + } + vmselectLN, err := netutil.NewTCPListener("vmselect", vmselectAddr) + if err != nil { + return nil, fmt.Errorf("unable to listen vmselectAddr %s: %s", vmselectAddr, err) + } + if err := encoding.CheckPrecisionBits(uint8(*precisionBits)); err != nil { + return nil, fmt.Errorf("invalid -precisionBits: %s", err) + } + + // Set network-level write timeouts to reasonable values in order to protect + // from broken networks. + // Do not set read timeouts, since they are managed separately - + // search for SetReadDeadline in this file. + vminsertLN.WriteTimeout = time.Minute + vmselectLN.WriteTimeout = time.Minute + + s := &Server{ + storage: storage, + + vminsertLN: vminsertLN, + vmselectLN: vmselectLN, + } + s.vminsertConnsMap.Init() + s.vmselectConnsMap.Init() + return s, nil +} + +// RunVMInsert runs a server accepting connections from vminsert. 
+func (s *Server) RunVMInsert() { + logger.Infof("accepting vminsert conns at %s", s.vminsertLN.Addr()) + for { + c, err := s.vminsertLN.Accept() + if err != nil { + if pe, ok := err.(net.Error); ok && pe.Temporary() { + continue + } + if s.isStopping() { + return + } + logger.Panicf("FATAL: cannot process vminsert conns at %s: %s", s.vminsertLN.Addr(), err) + } + logger.Infof("accepted vminsert conn from %s", c.RemoteAddr()) + + vminsertConns.Inc() + s.vminsertConnsMap.Add(c) + s.vminsertWG.Add(1) + go func() { + defer func() { + s.vminsertConnsMap.Delete(c) + vminsertConns.Dec() + s.vminsertWG.Done() + }() + + // There is no need in response compression, since + // vmstorage doesn't send anything back to vminsert. + compressionLevel := 0 + bc, err := handshake.VMInsertServer(c, compressionLevel) + if err != nil { + if s.isStopping() { + // c is stopped inside Server.MustClose + return + } + logger.Errorf("cannot perform vminsert handshake with client %q: %s", c.RemoteAddr(), err) + _ = c.Close() + return + } + defer func() { + if !s.isStopping() { + logger.Infof("closing vminsert conn from %s", c.RemoteAddr()) + } + _ = bc.Close() + }() + + logger.Infof("processing vminsert conn from %s", c.RemoteAddr()) + if err := s.processVMInsertConn(bc); err != nil { + if s.isStopping() { + return + } + vminsertConnErrors.Inc() + logger.Errorf("cannot process vminsert conn from %s: %s", c.RemoteAddr(), err) + } + }() + } +} + +var ( + vminsertConns = metrics.NewCounter("vm_vminsert_conns") + vminsertConnErrors = metrics.NewCounter("vm_vminsert_conn_errors_total") +) + +// RunVMSelect runs a server accepting connections from vmselect. +func (s *Server) RunVMSelect() { + logger.Infof("accepting vmselect conns at %s", s.vmselectLN.Addr()) + for { + c, err := s.vmselectLN.Accept() + if err != nil { + if pe, ok := err.(net.Error); ok && pe.Temporary() { + continue + } + if s.isStopping() { + return + } + logger.Panicf("FATAL: cannot process vmselect conns at %s: %s", s.vmselectLN.Addr(), err) + } + logger.Infof("accepted vmselect conn from %s", c.RemoteAddr()) + + vmselectConns.Inc() + s.vmselectConnsMap.Add(c) + s.vmselectWG.Add(1) + go func() { + defer func() { + s.vmselectConnsMap.Delete(c) + vmselectConns.Dec() + s.vmselectWG.Done() + }() + + // Do not compress responses to vmselect, since these responses + // already contain compressed data. + compressionLevel := 0 + bc, err := handshake.VMSelectServer(c, compressionLevel) + if err != nil { + if s.isStopping() { + // c is closed inside Server.MustClose + return + } + logger.Errorf("cannot perform vmselect handshake with client %q: %s", c.RemoteAddr(), err) + _ = c.Close() + return + } + + defer func() { + if !s.isStopping() { + logger.Infof("closing vmselect conn from %s", c.RemoteAddr()) + } + _ = bc.Close() + }() + + logger.Infof("processing vmselect conn from %s", c.RemoteAddr()) + if err := s.processVMSelectConn(bc); err != nil { + if s.isStopping() { + return + } + vmselectConnErrors.Inc() + logger.Errorf("cannot process vmselect conn %s: %s", c.RemoteAddr(), err) + } + }() + } +} + +var ( + vmselectConns = metrics.NewCounter("vm_vmselect_conns") + vmselectConnErrors = metrics.NewCounter("vm_vmselect_conn_errors_total") +) + +// MustClose gracefully closes the server, +// so it no longer touches s.storage after returning. +func (s *Server) MustClose() { + // Mark the server as stoping. + s.setIsStopping() + + // Stop accepting new connections from vminsert and vmselect. 
+ if err := s.vminsertLN.Close(); err != nil { + logger.Panicf("FATAL: cannot close vminsert listener: %s", err) + } + if err := s.vmselectLN.Close(); err != nil { + logger.Panicf("FATAL: cannot close vmselect listener: %s", err) + } + + // Close existing connections from vminsert, so the goroutines + // processing these connections are finished. + s.vminsertConnsMap.CloseAll() + + // Close existing connections from vmselect, so the goroutines + // processing these connections are finished. + s.vmselectConnsMap.CloseAll() + + // Wait until all the goroutines processing vminsert and vmselect conns + // are finished. + s.vminsertWG.Wait() + s.vmselectWG.Wait() +} + +func (s *Server) setIsStopping() { + atomic.StoreUint64(&s.stopFlag, 1) +} + +func (s *Server) isStopping() bool { + return atomic.LoadUint64(&s.stopFlag) != 0 +} + +func (s *Server) processVMInsertConn(r io.Reader) error { + sizeBuf := make([]byte, 8) + var buf []byte + var mrs []storage.MetricRow + for { + if _, err := io.ReadFull(r, sizeBuf); err != nil { + if err == io.EOF { + // Remote end gracefully closed the connection. + return nil + } + return fmt.Errorf("cannot read packet size: %s", err) + } + packetSize := encoding.UnmarshalUint64(sizeBuf) + if packetSize > consts.MaxInsertPacketSize { + return fmt.Errorf("too big packet size: %d; shouldn't exceed %d", packetSize, consts.MaxInsertPacketSize) + } + buf = bytesutil.Resize(buf, int(packetSize)) + if _, err := io.ReadFull(r, buf); err != nil { + return fmt.Errorf("cannot read packet with size %d: %s", packetSize, err) + } + vminsertPacketsRead.Inc() + + // Read metric rows from the packet. + mrs = mrs[:0] + tail := buf + for len(tail) > 0 { + if len(mrs) < cap(mrs) { + mrs = mrs[:len(mrs)+1] + } else { + mrs = append(mrs, storage.MetricRow{}) + } + mr := &mrs[len(mrs)-1] + var err error + tail, err = mr.Unmarshal(tail) + if err != nil { + return fmt.Errorf("cannot unmarshal MetricRow: %s", err) + } + } + vminsertMetricsRead.Add(len(mrs)) + if err := s.storage.AddRows(mrs, uint8(*precisionBits)); err != nil { + return fmt.Errorf("cannot store metrics: %s", err) + } + } +} + +var ( + vminsertPacketsRead = metrics.NewCounter("vm_vminsert_packets_read_total") + vminsertMetricsRead = metrics.NewCounter("vm_vminsert_metrics_read_total") +) + +func (s *Server) processVMSelectConn(bc *handshake.BufferedConn) error { + ctx := &vmselectRequestCtx{ + bc: bc, + sizeBuf: make([]byte, 8), + } + for { + err := s.processVMSelectRequest(ctx) + n := atomic.LoadUint64(&ctx.sr.MissingMetricNamesForMetricID) + missingMetricNamesForMetricID.Add(int(n)) + if err != nil { + if err == io.EOF { + // Remote client gracefully closed the connection. 
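`processVMInsertConn` above reads insert packets framed as an 8-byte size prefix followed by the marshaled `MetricRow` payload, rejecting anything larger than `consts.MaxInsertPacketSize`. The sketch below reproduces only that framing over an in-memory pipe; the real vminsert writer and the preceding handshake are outside this hunk, and big-endian integers are assumed to match `lib/encoding`:

```go
package main

import (
	"encoding/binary"
	"fmt"
	"io"
	"net"
)

// writePacket frames a payload the way processVMInsertConn expects:
// an 8-byte size prefix followed by the payload bytes.
func writePacket(w io.Writer, payload []byte) error {
	var sizeBuf [8]byte
	binary.BigEndian.PutUint64(sizeBuf[:], uint64(len(payload)))
	if _, err := w.Write(sizeBuf[:]); err != nil {
		return err
	}
	_, err := w.Write(payload)
	return err
}

// readPacket mirrors the server-side read; the real server additionally
// rejects packets bigger than consts.MaxInsertPacketSize.
func readPacket(r io.Reader) ([]byte, error) {
	sizeBuf := make([]byte, 8)
	if _, err := io.ReadFull(r, sizeBuf); err != nil {
		return nil, err
	}
	buf := make([]byte, binary.BigEndian.Uint64(sizeBuf))
	_, err := io.ReadFull(r, buf)
	return buf, err
}

func main() {
	client, server := net.Pipe()
	go func() {
		_ = writePacket(client, []byte("marshaled MetricRows go here"))
		client.Close()
	}()
	payload, err := readPacket(server)
	if err != nil {
		panic(err)
	}
	fmt.Printf("received %d-byte packet: %q\n", len(payload), payload)
}
```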
+ return nil + } + return fmt.Errorf("cannot process vmselect request: %s", err) + } + if err := bc.Flush(); err != nil { + return fmt.Errorf("cannot flush compressed buffers: %s", err) + } + } +} + +var missingMetricNamesForMetricID = metrics.NewCounter(`vm_missing_metric_names_for_metric_id_total`) + +type vmselectRequestCtx struct { + bc *handshake.BufferedConn + sizeBuf []byte + dataBuf []byte + + sq storage.SearchQuery + tfss []*storage.TagFilters + sr storage.Search +} + +func (ctx *vmselectRequestCtx) readUint32() (uint32, error) { + ctx.sizeBuf = bytesutil.Resize(ctx.sizeBuf, 4) + if _, err := io.ReadFull(ctx.bc, ctx.sizeBuf); err != nil { + if err == io.EOF { + return 0, err + } + return 0, fmt.Errorf("cannot read uint32: %s", err) + } + n := encoding.UnmarshalUint32(ctx.sizeBuf) + return n, nil +} + +func (ctx *vmselectRequestCtx) readDataBufBytes(maxDataSize int) error { + ctx.sizeBuf = bytesutil.Resize(ctx.sizeBuf, 8) + if _, err := io.ReadFull(ctx.bc, ctx.sizeBuf); err != nil { + if err == io.EOF { + return err + } + return fmt.Errorf("cannot read data size: %s", err) + } + dataSize := encoding.UnmarshalUint64(ctx.sizeBuf) + if dataSize > uint64(maxDataSize) { + return fmt.Errorf("too big data size: %d; it mustn't exceed %d bytes", dataSize, maxDataSize) + } + ctx.dataBuf = bytesutil.Resize(ctx.dataBuf, int(dataSize)) + if dataSize == 0 { + return nil + } + if _, err := io.ReadFull(ctx.bc, ctx.dataBuf); err != nil { + return fmt.Errorf("cannot read data with size %d: %s", dataSize, err) + } + return nil +} + +func (ctx *vmselectRequestCtx) writeDataBufBytes() error { + if err := ctx.writeUint64(uint64(len(ctx.dataBuf))); err != nil { + return fmt.Errorf("cannot write data size: %s", err) + } + if len(ctx.dataBuf) == 0 { + return nil + } + if _, err := ctx.bc.Write(ctx.dataBuf); err != nil { + return fmt.Errorf("cannot write data with size %d: %s", len(ctx.dataBuf), err) + } + return nil +} + +func (ctx *vmselectRequestCtx) writeString(s string) error { + ctx.dataBuf = append(ctx.dataBuf[:0], s...) + return ctx.writeDataBufBytes() +} + +func (ctx *vmselectRequestCtx) writeUint64(n uint64) error { + ctx.sizeBuf = encoding.MarshalUint64(ctx.sizeBuf[:0], n) + if _, err := ctx.bc.Write(ctx.sizeBuf); err != nil { + return fmt.Errorf("cannot write uint64 %d: %s", n, err) + } + return nil +} + +const maxRPCNameSize = 128 + +var zeroTime time.Time + +func (s *Server) processVMSelectRequest(ctx *vmselectRequestCtx) error { + // Read rpcName + // Do not set deadline on reading rpcName, since it may take a + // lot of time for idle connection. + if err := ctx.readDataBufBytes(maxRPCNameSize); err != nil { + if err == io.EOF { + // Remote client gracefully closed the connection. + return err + } + return fmt.Errorf("cannot read rpcName: %s", err) + } + + // Limit the time required for reading request args. 
+ if err := ctx.bc.SetReadDeadline(time.Now().Add(5 * time.Second)); err != nil { + return fmt.Errorf("cannot set read deadline for reading request args: %s", err) + } + defer func() { + _ = ctx.bc.SetReadDeadline(zeroTime) + }() + + switch string(ctx.dataBuf) { + case "search_v2": + return s.processVMSelectSearchQuery(ctx) + case "labelValues": + return s.processVMSelectLabelValues(ctx) + case "labels": + return s.processVMSelectLabels(ctx) + case "seriesCount": + return s.processVMSelectSeriesCount(ctx) + case "deleteMetrics_v2": + return s.processVMSelectDeleteMetrics(ctx) + default: + return fmt.Errorf("unsupported rpcName: %q", ctx.dataBuf) + } +} + +const maxTagFiltersSize = 64 * 1024 + +func (s *Server) processVMSelectDeleteMetrics(ctx *vmselectRequestCtx) error { + vmselectDeleteMetricsRequests.Inc() + + // Read request + if err := ctx.readDataBufBytes(maxTagFiltersSize); err != nil { + return fmt.Errorf("cannot read labelName: %s", err) + } + tail, err := ctx.sq.Unmarshal(ctx.dataBuf) + if err != nil { + return fmt.Errorf("cannot unmarshal SearchQuery: %s", err) + } + if len(tail) > 0 { + return fmt.Errorf("unexpected non-zero tail left after unmarshaling SearchQuery: (len=%d) %q", len(tail), tail) + } + + // Setup ctx.tfss + if err := ctx.setupTfss(); err != nil { + // Send the error message to vmselect. + errMsg := err.Error() + if err := ctx.writeString(errMsg); err != nil { + return fmt.Errorf("cannot send error message: %s", err) + } + return nil + } + + // Delete the given metrics. + deletedCount, err := s.storage.DeleteMetrics(ctx.tfss) + if err != nil { + if err := ctx.writeString(err.Error()); err != nil { + return fmt.Errorf("cannot send error message: %s", err) + } + return nil + } + + // Send an empty error message to vmselect. + if err := ctx.writeString(""); err != nil { + return fmt.Errorf("cannot send empty error message: %s", err) + } + // Send deletedCount to vmselect. + if err := ctx.writeUint64(uint64(deletedCount)); err != nil { + return fmt.Errorf("cannot send deletedCount=%d: %s", deletedCount, err) + } + return nil +} + +func (s *Server) processVMSelectLabels(ctx *vmselectRequestCtx) error { + vmselectLabelsRequests.Inc() + + // Read request + accountID, err := ctx.readUint32() + if err != nil { + return fmt.Errorf("cannot read accountID: %s", err) + } + projectID, err := ctx.readUint32() + if err != nil { + return fmt.Errorf("cannot read projectID: %s", err) + } + + // Search for tag keys + labels, err := s.storage.SearchTagKeys(accountID, projectID, *maxTagKeysPerSearch) + if err != nil { + // Send the error message to vmselect. + errMsg := fmt.Sprintf("error during labels search: %s", err) + if err := ctx.writeString(errMsg); err != nil { + return fmt.Errorf("cannot send error message: %s", err) + } + return nil + } + + // Send an empty error message to vmselect. + if err := ctx.writeString(""); err != nil { + return fmt.Errorf("cannot send empty error message: %s", err) + } + + // Send labels to vmselect + for _, label := range labels { + if len(label) == 0 { + // Do this substitution in order to prevent clashing with 'end of response' marker. 
+ label = "__name__" + } + if err := ctx.writeString(label); err != nil { + return fmt.Errorf("cannot write label %q: %s", label, err) + } + } + + // Send 'end of response' marker + if err := ctx.writeString(""); err != nil { + return fmt.Errorf("cannot send 'end of response' marker") + } + return nil +} + +const maxLabelValueSize = 16 * 1024 + +func (s *Server) processVMSelectLabelValues(ctx *vmselectRequestCtx) error { + vmselectLabelValuesRequests.Inc() + + // Read request + accountID, err := ctx.readUint32() + if err != nil { + return fmt.Errorf("cannot read accountID: %s", err) + } + projectID, err := ctx.readUint32() + if err != nil { + return fmt.Errorf("cannot read projectID: %s", err) + } + if err := ctx.readDataBufBytes(maxLabelValueSize); err != nil { + return fmt.Errorf("cannot read labelName: %s", err) + } + labelName := ctx.dataBuf + + // Search for tag values + labelValues, err := s.storage.SearchTagValues(accountID, projectID, labelName, *maxTagValuesPerSearch) + if err != nil { + // Send the error message to vmselect. + errMsg := fmt.Sprintf("error during label values search for labelName=%q: %s", labelName, err) + if err := ctx.writeString(errMsg); err != nil { + return fmt.Errorf("cannot send error message: %s", err) + } + return nil + } + + // Send an empty error message to vmselect. + if err := ctx.writeString(""); err != nil { + return fmt.Errorf("cannot send empty error message: %s", err) + } + + // Send labelValues to vmselect + for _, labelValue := range labelValues { + if len(labelValue) == 0 { + // Skip empty label values, since they have no sense for prometheus. + continue + } + if err := ctx.writeString(labelValue); err != nil { + return fmt.Errorf("cannot write labelValue %q: %s", labelValue, err) + } + } + + // Send 'end of response' marker + if err := ctx.writeString(""); err != nil { + return fmt.Errorf("cannot send 'end of response' marker") + } + return nil +} + +func (s *Server) processVMSelectSeriesCount(ctx *vmselectRequestCtx) error { + vmselectSeriesCountRequests.Inc() + + // Read request + accountID, err := ctx.readUint32() + if err != nil { + return fmt.Errorf("cannot read accountID: %s", err) + } + projectID, err := ctx.readUint32() + if err != nil { + return fmt.Errorf("cannot read projectID: %s", err) + } + + // Execute the request + n, err := s.storage.GetSeriesCount(accountID, projectID) + if err != nil { + // Send the error message to vmselect. + errMsg := fmt.Sprintf("error during obtaining series count: %s", err) + if err := ctx.writeString(errMsg); err != nil { + return fmt.Errorf("cannot send error message: %s", err) + } + return nil + } + + // Send an empty error message to vmselect. + if err := ctx.writeString(""); err != nil { + return fmt.Errorf("cannot send empty error message: %s", err) + } + + // Send series count to vmselect. + if err := ctx.writeUint64(n); err != nil { + return fmt.Errorf("cannot write series count to vmselect: %s", err) + } + return nil +} + +// maxSearchQuerySize is the maximum size of SearchQuery packet in bytes. +const maxSearchQuerySize = 1024 * 1024 + +func (s *Server) processVMSelectSearchQuery(ctx *vmselectRequestCtx) error { + vmselectSearchQueryRequests.Inc() + + // Read search query. 
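The simple vmselect RPCs handled above (`labels`, `labelValues`, `seriesCount`) share one wire convention: length-prefixed strings, an error string that is empty on success, and an empty string as the end-of-response marker. The sketch below infers the client side of the `labels` RPC from that server code; it ignores the handshake and compression layer and assumes big-endian integers, so it is an illustration rather than the real `app/vmselect/netstorage` client:

```go
package main

import (
	"encoding/binary"
	"fmt"
	"io"
)

// readString reads one length-prefixed string: an 8-byte size followed by the bytes.
func readString(r io.Reader) (string, error) {
	var sizeBuf [8]byte
	if _, err := io.ReadFull(r, sizeBuf[:]); err != nil {
		return "", err
	}
	buf := make([]byte, binary.BigEndian.Uint64(sizeBuf[:]))
	if _, err := io.ReadFull(r, buf); err != nil {
		return "", err
	}
	return string(buf), nil
}

func writeString(w io.Writer, s string) error {
	var sizeBuf [8]byte
	binary.BigEndian.PutUint64(sizeBuf[:], uint64(len(s)))
	if _, err := w.Write(sizeBuf[:]); err != nil {
		return err
	}
	_, err := io.WriteString(w, s)
	return err
}

func writeUint32(w io.Writer, n uint32) error {
	var buf [4]byte
	binary.BigEndian.PutUint32(buf[:], n)
	_, err := w.Write(buf[:])
	return err
}

// requestLabels sketches the client side of the "labels" RPC served by
// processVMSelectLabels: send the rpcName and tenant IDs, read the error
// string, then read label names until the empty end-of-response marker.
func requestLabels(rw io.ReadWriter, accountID, projectID uint32) ([]string, error) {
	if err := writeString(rw, "labels"); err != nil {
		return nil, err
	}
	if err := writeUint32(rw, accountID); err != nil {
		return nil, err
	}
	if err := writeUint32(rw, projectID); err != nil {
		return nil, err
	}
	errMsg, err := readString(rw)
	if err != nil {
		return nil, err
	}
	if errMsg != "" {
		return nil, fmt.Errorf("remote error: %s", errMsg)
	}
	var labels []string
	for {
		label, err := readString(rw)
		if err != nil {
			return nil, err
		}
		if label == "" {
			return labels, nil // end-of-response marker
		}
		labels = append(labels, label)
	}
}

func main() {
	// Wiring requestLabels to a live vmstorage also requires the vmselect
	// handshake, which is not shown in this hunk.
	fmt.Println("see requestLabels for the assumed wire sequence")
}
```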
+ if err := ctx.readDataBufBytes(maxSearchQuerySize); err != nil { + return fmt.Errorf("cannot read searchQuery: %s", err) + } + tail, err := ctx.sq.Unmarshal(ctx.dataBuf) + if err != nil { + return fmt.Errorf("cannot unmarshal SearchQuery: %s", err) + } + if len(tail) > 0 { + return fmt.Errorf("unexpected non-zero tail left after unmarshaling SearchQuery: (len=%d) %q", len(tail), tail) + } + + // Setup search. + if err := ctx.setupTfss(); err != nil { + // Send the error message to vmselect. + errMsg := err.Error() + if err := ctx.writeString(errMsg); err != nil { + return fmt.Errorf("cannot send error message: %s", err) + } + return nil + } + tr := storage.TimeRange{ + MinTimestamp: ctx.sq.MinTimestamp, + MaxTimestamp: ctx.sq.MaxTimestamp, + } + ctx.sr.Init(s.storage, ctx.tfss, tr, *maxMetricsPerSearch) + defer ctx.sr.MustClose() + if err := ctx.sr.Error(); err != nil { + // Send the error message to vmselect. + errMsg := fmt.Sprintf("search error: %s", err) + if err := ctx.writeString(errMsg); err != nil { + return fmt.Errorf("cannot send error message: %s", err) + } + return nil + } + + // Send empty error message to vmselect. + if err := ctx.writeString(""); err != nil { + return fmt.Errorf("cannot send empty error message: %s", err) + } + + // Send found blocks to vmselect. + for ctx.sr.NextMetricBlock() { + mb := ctx.sr.MetricBlock + + vmselectMetricBlocksRead.Inc() + vmselectMetricRowsRead.Add(mb.Block.RowsCount()) + + ctx.dataBuf = mb.Marshal(ctx.dataBuf[:0]) + if err := ctx.writeDataBufBytes(); err != nil { + return fmt.Errorf("cannot send MetricBlock: %s", err) + } + } + if err := ctx.sr.Error(); err != nil { + return fmt.Errorf("search error: %s", err) + } + + // Send 'end of response' marker + if err := ctx.writeString(""); err != nil { + return fmt.Errorf("cannot send 'end of response' marker") + } + return nil +} + +var ( + vmselectDeleteMetricsRequests = metrics.NewCounter("vm_vmselect_delete_metrics_requests_total") + vmselectLabelsRequests = metrics.NewCounter("vm_vmselect_labels_requests_total") + vmselectLabelValuesRequests = metrics.NewCounter("vm_vmselect_label_values_requests_total") + vmselectSeriesCountRequests = metrics.NewCounter("vm_vmselect_series_count_requests_total") + vmselectSearchQueryRequests = metrics.NewCounter("vm_vmselect_search_query_requests_total") + vmselectMetricBlocksRead = metrics.NewCounter("vm_vmselect_metric_blocks_read_total") + vmselectMetricRowsRead = metrics.NewCounter("vm_vmselect_metric_rows_read_total") +) + +func (ctx *vmselectRequestCtx) setupTfss() error { + tfss := ctx.tfss[:0] + for _, tagFilters := range ctx.sq.TagFilterss { + if len(tfss) < cap(tfss) { + tfss = tfss[:len(tfss)+1] + } else { + tfss = append(tfss, &storage.TagFilters{}) + } + tfs := tfss[len(tfss)-1] + tfs.Reset(ctx.sq.AccountID, ctx.sq.ProjectID) + for i := range tagFilters { + tf := &tagFilters[i] + if err := tfs.Add(tf.Key, tf.Value, tf.IsNegative, tf.IsRegexp); err != nil { + return fmt.Errorf("cannot parse tag filter %s: %s", tf, err) + } + } + } + ctx.tfss = tfss + return nil +} diff --git a/deployment/docker/Makefile b/deployment/docker/Makefile index 84537632f..a57f1f616 100644 --- a/deployment/docker/Makefile +++ b/deployment/docker/Makefile @@ -1,3 +1,5 @@ +# All these commands must run from repository root. 
+ DOCKER_NAMESPACE := valyala BUILDER_IMAGE := local/builder:go1.12.5 CERTS_IMAGE := local/certs:1.0.2 diff --git a/deployment/docker/docker-compose.yml b/deployment/docker/docker-compose.yml new file mode 100644 index 000000000..cb1f8b3b7 --- /dev/null +++ b/deployment/docker/docker-compose.yml @@ -0,0 +1,68 @@ +version: '3.5' +services: + prometheus: + container_name: prometheus + image: prom/prometheus:v2.3.2 + depends_on: + - "vminsert" + - "vmselect" + ports: + - 9090:9090 + volumes: + - promdata:/prometheus + - ./prometheus.yml:/etc/prometheus/prometheus.yml + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/usr/share/prometheus/console_libraries' + - '--web.console.templates=/usr/share/prometheus/consoles' + networks: + - docker_net + restart: always + vmstorage: + container_name: vmstorage + image: valyala/vmstorage:heads-cluster-0-gca0d4847 + ports: + - 8482:8482 + - 8400:8400 + - 8401:8401 + volumes: + - strgdata:/storage + command: + - '--storageDataPath=/storage' + - '--vminsertAddr=:8401' + - '--vmselectAddr=:8400' + - '--httpListenAddr=:8482' + networks: + - docker_net + restart: always + vmselect: + container_name: vmselect + image: valyala/vmselect:heads-cluster-0-gca0d4847 + depends_on: + - "vmstorage" + ports: + - 8480:8480 + command: + - '--storageNode=vmstorage:8400' + networks: + - docker_net + restart: always + vminsert: + container_name: vminsert + image: valyala/vminsert:heads-cluster-0-gca0d4847 + depends_on: + - "vmstorage" + command: + - '--storageNode=vmstorage:8401' + ports: + - 8481:8481 + networks: + - docker_net + restart: always +volumes: + promdata: {} + strgdata: {} +networks: + docker_net: + driver: bridge diff --git a/deployment/docker/prometheus.yml b/deployment/docker/prometheus.yml new file mode 100644 index 000000000..dc2f7d4a0 --- /dev/null +++ b/deployment/docker/prometheus.yml @@ -0,0 +1,23 @@ +global: + scrape_interval: 10s + evaluation_interval: 10s + +remote_write: + - url: "http://vminsert:8480/insert/0/prometheus/" + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['prometheus:9090'] + + - job_name: 'vminsert' + static_configs: + - targets: ['vminsert:8480'] + + - job_name: 'vmselect' + static_configs: + - targets: ['vmselect:8481'] + + - job_name: 'vmstorage' + static_configs: + - targets: ['vmstorage:8482'] diff --git a/deployment/k8s/helm/Makefile b/deployment/k8s/helm/Makefile new file mode 100644 index 000000000..45509957d --- /dev/null +++ b/deployment/k8s/helm/Makefile @@ -0,0 +1,26 @@ +# All these commands must run from repository root. 
+ +HELM_PROJECT=victoria-metrics +HELM_PATH=deployment/k8s/helm/${HELM_PROJECT} +HELM_APP_VERSION=1.0 + +helm-init: + @helm init + +helm-install: + helm install $(HELM_PATH) -n $(ENV) + +helm-install-dev: + ENV=dev $(MAKE) helm-install + +helm-upgrade: + helm upgrade $(ENV) $(HELM_PATH) + +helm-upgrade-dev: + ENV=dev $(MAKE) helm-upgrade + +helm-delete: + helm del --purge $(ENV) + +helm-delete-dev: + ENV=dev $(MAKE) helm-delete diff --git a/deployment/k8s/helm/README.md b/deployment/k8s/helm/README.md new file mode 100644 index 000000000..cf7bf6867 --- /dev/null +++ b/deployment/k8s/helm/README.md @@ -0,0 +1,37 @@ +### Victoria metrics helm chart + +#### Create cluster from chart + +```$bash +$ ENV= make helm-install +``` + +for DEV env : + +```$bash +$ make helm-install-dev +``` + +#### Upgrade cluster from chart + +```$bash +$ ENV= make helm-upgrade +``` + +for DEV env : + +```$bash +$ make helm-upgrade-dev +``` + +#### Delete chart from cluster + +```$bash +$ ENV= make helm-delete +``` + +for DEV env : + +```$bash +$ make helm-delete-dev +``` diff --git a/deployment/k8s/helm/victoria-metrics/.helmignore b/deployment/k8s/helm/victoria-metrics/.helmignore new file mode 100644 index 000000000..50af03172 --- /dev/null +++ b/deployment/k8s/helm/victoria-metrics/.helmignore @@ -0,0 +1,22 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/deployment/k8s/helm/victoria-metrics/Chart.yaml b/deployment/k8s/helm/victoria-metrics/Chart.yaml new file mode 100644 index 000000000..48c2d77b2 --- /dev/null +++ b/deployment/k8s/helm/victoria-metrics/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +appVersion: "1.0" +description: A Helm chart for Kubernetes +name: victoria-metrics +version: 0.1.0 diff --git a/deployment/k8s/helm/victoria-metrics/README.md b/deployment/k8s/helm/victoria-metrics/README.md new file mode 100644 index 000000000..c89442c67 --- /dev/null +++ b/deployment/k8s/helm/victoria-metrics/README.md @@ -0,0 +1,8 @@ +# Victoria Metrics + +## TL;DR; + +1. Install helm chart. Check the output. +2. Specify Remote Write URL in Prometheus. +3. Configure Grafana's Prometheus Data Source. + diff --git a/deployment/k8s/helm/victoria-metrics/templates/NOTES.txt b/deployment/k8s/helm/victoria-metrics/templates/NOTES.txt new file mode 100644 index 000000000..a276072d2 --- /dev/null +++ b/deployment/k8s/helm/victoria-metrics/templates/NOTES.txt @@ -0,0 +1,76 @@ +{{ if .Values.vminsert.enabled }} +Write API: + +The Victoria Metrics write api can be accessed via port {{ .Values.vmselect.service.servicePort }} on the following DNS name from within your cluster: +{{ template "victoria-metrics.vminsert.fullname" . }}.{{ .Release.Namespace }}.svc.{{ .Values.clusterDomainSuffix }} + +Get the Victoria Metrics insert service URL by running these commands in the same shell: +{{- if contains "NodePort" .Values.vminsert.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ template "victoria-metrics.vminsert.fullname" . 
}})
+  export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}")
+  echo http://$NODE_IP:$NODE_PORT
+{{- else if contains "LoadBalancer" .Values.vminsert.service.type }}
+     NOTE: It may take a few minutes for the LoadBalancer IP to be available.
+           You can watch its status by running 'kubectl get svc --namespace {{ .Release.Namespace }} -w {{ template "victoria-metrics.vminsert.fullname" . }}'
+
+  export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ template "victoria-metrics.vminsert.fullname" . }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
+  echo http://$SERVICE_IP:{{ .Values.vminsert.service.servicePort }}
+{{- else if contains "ClusterIP" .Values.vminsert.service.type }}
+  export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app={{ .Values.vminsert.name }}" -o jsonpath="{.items[0].metadata.name}")
+  kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8480
+{{- end }}
+
+You need to update your Prometheus configuration file by adding the following lines to it:
+
+prometheus.yml
+```yaml
+remote_write:
+  - url: "http:///insert/{{ .Values.vmstorage.retentionPeriod }}m/1/{{.Release.Name}}/prometheus/"
+
+```
+
+For example, inside the Kubernetes cluster:
+```yaml
+remote_write:
+  - url: "http://{{ template "victoria-metrics.vminsert.fullname" . }}.{{ .Release.Namespace }}.svc.{{ .Values.clusterDomainSuffix }}:{{ .Values.vminsert.service.servicePort }}/insert/{{ .Values.vmstorage.retentionPeriod }}m/1/{{.Release.Name}}/prometheus/"
+
+```
+{{- end }}
+
+{{- if .Values.vmselect.enabled }}
+Read API:
+
+The Victoria Metrics read API can be accessed via port {{ .Values.vmselect.service.servicePort }} on the following DNS name from within your cluster:
+{{ template "victoria-metrics.vmselect.fullname" . }}.{{ .Release.Namespace }}.svc.{{ .Values.clusterDomainSuffix }}
+
+Get the Victoria Metrics select service URL by running these commands in the same shell:
+{{- if contains "NodePort" .Values.vmselect.service.type }}
+  export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ template "victoria-metrics.vmselect.fullname" . }})
+  export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}")
+  echo http://$NODE_IP:$NODE_PORT
+{{- else if contains "LoadBalancer" .Values.vmselect.service.type }}
+     NOTE: It may take a few minutes for the LoadBalancer IP to be available.
+           You can watch its status by running 'kubectl get svc --namespace {{ .Release.Namespace }} -w {{ template "victoria-metrics.vmselect.fullname" . }}'
+
+  export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ template "victoria-metrics.vmselect.fullname" . }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
+  echo http://$SERVICE_IP:{{ .Values.vmselect.service.servicePort }}
+{{- else if contains "ClusterIP" .Values.vmselect.service.type }}
+  export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app={{ .Values.vmselect.name }}" -o jsonpath="{.items[0].metadata.name}")
+  kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8481
+{{- end }}
+
+You need to specify the select service URL in your Grafana:
+ NOTE: you need to use the Prometheus Data Source type
+
+Input for the URL field in Grafana:
+
+```
+http:///select/{{ .Values.vmstorage.retentionPeriod }}m/1/{{.Release.Name}}/prometheus/
+```
+
+For example,
+```
+http://{{ template "victoria-metrics.vmselect.fullname" . }}.{{ .Release.Namespace }}.svc.{{ .Values.clusterDomainSuffix }}:{{ .Values.vmselect.service.servicePort }}/select/{{ .Values.vmstorage.retentionPeriod }}m/1/{{.Release.Name}}/prometheus/
+```
+{{- end }}
+
diff --git a/deployment/k8s/helm/victoria-metrics/templates/_helpers.tpl b/deployment/k8s/helm/victoria-metrics/templates/_helpers.tpl
new file mode 100644
index 000000000..7ed138313
--- /dev/null
+++ b/deployment/k8s/helm/victoria-metrics/templates/_helpers.tpl
@@ -0,0 +1,129 @@
+{{/* vim: set filetype=mustache: */}}
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "victoria-metrics.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "victoria-metrics.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{/*
+Create unified labels for victoria-metrics components
+*/}}
+{{- define "victoria-metrics.common.matchLabels" -}}
+app.kubernetes.io/name: {{ include "victoria-metrics.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end -}}
+
+{{- define "victoria-metrics.common.metaLabels" -}}
+helm.sh/chart: {{ include "victoria-metrics.chart" . }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end -}}
+
+{{- define "victoria-metrics.vmstorage.labels" -}}
+{{ include "victoria-metrics.vmstorage.matchLabels" . }}
+{{ include "victoria-metrics.common.metaLabels" . }}
+{{- end -}}
+
+{{- define "victoria-metrics.vmstorage.matchLabels" -}}
+app: {{ .Values.vmstorage.name }}
+{{ include "victoria-metrics.common.matchLabels" . }}
+{{- end -}}
+
+{{- define "victoria-metrics.vmselect.labels" -}}
+{{ include "victoria-metrics.vmselect.matchLabels" . }}
+{{ include "victoria-metrics.common.metaLabels" . }}
+{{- end -}}
+
+{{- define "victoria-metrics.vmselect.matchLabels" -}}
+app: {{ .Values.vmselect.name }}
+{{ include "victoria-metrics.common.matchLabels" . }}
+{{- end -}}
+
+{{- define "victoria-metrics.vminsert.labels" -}}
+{{ include "victoria-metrics.vminsert.matchLabels" . }}
+{{ include "victoria-metrics.common.metaLabels" . }}
+{{- end -}}
+
+{{- define "victoria-metrics.vminsert.matchLabels" -}}
+app: {{ .Values.vminsert.name }}
+{{ include "victoria-metrics.common.matchLabels" . }}
+{{- end -}}
+
+{{/*
+Create a fully qualified vmstorage name.
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
+*/}}
+{{- define "victoria-metrics.vmstorage.fullname" -}}
+{{- if .Values.vmstorage.fullnameOverride -}}
+{{- .Values.vmstorage.fullnameOverride | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- $name := default .Chart.Name .Values.nameOverride -}}
+{{- if contains $name .Release.Name -}}
+{{- printf "%s-%s" .Release.Name .Values.vmstorage.name | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- printf "%s-%s-%s" .Release.Name $name .Values.vmstorage.name | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+Create a fully qualified vmselect name.
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
+*/}} +{{- define "victoria-metrics.vmselect.fullname" -}} +{{- if .Values.vmselect.fullnameOverride -}} +{{- .Values.vmselect.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- printf "%s-%s" .Release.Name .Values.vmselect.name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s-%s" .Release.Name $name .Values.vmselect.name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Create a fully qualified vmselect name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +*/}} +{{- define "victoria-metrics.vminsert.fullname" -}} +{{- if .Values.vminsert.fullnameOverride -}} +{{- .Values.vminsert.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- printf "%s-%s" .Release.Name .Values.vminsert.name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s-%s" .Release.Name $name .Values.vminsert.name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{- define "victoria-metrics.vmselect.vmstorage-pod-fqdn" -}} +{{- $pod := include "victoria-metrics.vmstorage.fullname" . -}} +{{- $svc := include "victoria-metrics.vmstorage.fullname" . -}} +{{- $namespace := .Release.Namespace -}} +{{- $dnsSuffix := .Values.clusterDomainSuffix -}} +{{- range $i := until (.Values.vmstorage.replicaCount | int) -}} +{{- printf "- --storageNode=%s-%d.%s.%s.svc.%s:8400\n" $pod $i $svc $namespace $dnsSuffix -}} +{{- end -}} +{{- end -}} + +{{- define "victoria-metrics.vminsert.vmstorage-pod-fqdn" -}} +{{- $pod := include "victoria-metrics.vmstorage.fullname" . -}} +{{- $svc := include "victoria-metrics.vmstorage.fullname" . -}} +{{- $namespace := .Release.Namespace -}} +{{- $dnsSuffix := .Values.clusterDomainSuffix -}} +{{- range $i := until (.Values.vmstorage.replicaCount | int) -}} +{{- printf "- --storageNode=%s-%d.%s.%s.svc.%s:8401\n" $pod $i $svc $namespace $dnsSuffix -}} +{{- end -}} +{{- end -}} + diff --git a/deployment/k8s/helm/victoria-metrics/templates/vminsert-deployment.yaml b/deployment/k8s/helm/victoria-metrics/templates/vminsert-deployment.yaml new file mode 100644 index 000000000..1797722c9 --- /dev/null +++ b/deployment/k8s/helm/victoria-metrics/templates/vminsert-deployment.yaml @@ -0,0 +1,65 @@ +{{- if .Values.vminsert.enabled -}} +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + labels: + {{- include "victoria-metrics.vminsert.labels" . | nindent 4 }} + name: {{ template "victoria-metrics.vminsert.fullname" . }} +spec: + selector: + matchLabels: + {{- include "victoria-metrics.vminsert.matchLabels" . | nindent 6 }} + replicas: {{ .Values.vminsert.replicaCount }} + template: + metadata: + {{- if .Values.vminsert.podAnnotations }} + annotations: +{{ toYaml .Values.vminsert.podAnnotations | indent 8 }} + {{- end }} + labels: + {{- include "victoria-metrics.vminsert.labels" . | nindent 8 }} + spec: +{{- if .Values.vminsert.priorityClassName }} + priorityClassName: "{{ .Values.vminsert.priorityClassName }}" +{{- end }} + containers: + - name: {{ template "victoria-metrics.name" . }}-{{ .Values.vminsert.name }} + image: "{{ .Values.vminsert.image.repository }}:{{ .Values.vminsert.image.tag }}" + imagePullPolicy: "{{ .Values.vminsert.image.pullPolicy }}" + args: + {{- include "victoria-metrics.vminsert.vmstorage-pod-fqdn" . 
| nindent 12 }} + {{- range $key, $value := .Values.vminsert.extraArgs }} + - --{{ $key }}={{ $value }} + {{- end }} + ports: + - name: http + containerPort: 8480 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 15 + resources: +{{ toYaml .Values.vminsert.resources | indent 12 }} + {{- if .Values.imagePullSecrets }} + imagePullSecrets: + {{ toYaml .Values.imagePullSecrets | indent 2 }} + {{- end }} + {{- if .Values.vminsert.nodeSelector }} + nodeSelector: +{{ toYaml .Values.vminsert.nodeSelector | indent 8 }} + {{- end }} + {{- if .Values.vminsert.securityContext }} + securityContext: +{{ toYaml .Values.vminsert.securityContext | indent 8 }} + {{- end }} + {{- if .Values.vminsert.tolerations }} + tolerations: +{{ toYaml .Values.vminsert.tolerations | indent 8 }} + {{- end }} + {{- if .Values.vminsert.affinity }} + affinity: +{{ toYaml .Values.vminsert.affinity | indent 8 }} + {{- end }} +{{- end }} diff --git a/deployment/k8s/helm/victoria-metrics/templates/vminsert-service.yaml b/deployment/k8s/helm/victoria-metrics/templates/vminsert-service.yaml new file mode 100644 index 000000000..48c36f1f4 --- /dev/null +++ b/deployment/k8s/helm/victoria-metrics/templates/vminsert-service.yaml @@ -0,0 +1,40 @@ +{{- if .Values.vminsert.enabled -}} +apiVersion: v1 +kind: Service +metadata: +{{- if .Values.vminsert.service.annotations }} + annotations: +{{ toYaml .Values.vminsert.service.annotations | indent 4}} +{{- end }} + labels: + {{- include "victoria-metrics.vminsert.labels" . | nindent 4 }} +{{- if .Values.vminsert.service.labels }} +{{ toYaml .Values.vminsert.service.labels | indent 4}} +{{- end }} + name: {{ template "victoria-metrics.vminsert.fullname" . }} +spec: +{{- if .Values.vminsert.service.clusterIP }} + clusterIP: {{ .Values.vminsert.service.clusterIP }} +{{- end }} +{{- if .Values.vminsert.service.externalIPs }} + externalIPs: +{{ toYaml .Values.vminsert.service.externalIPs | indent 4 }} +{{- end }} +{{- if .Values.vminsert.service.loadBalancerIP }} + loadBalancerIP: {{ .Values.vminsert.service.loadBalancerIP }} +{{- end }} +{{- if .Values.vminsert.service.loadBalancerSourceRanges }} + loadBalancerSourceRanges: + {{- range $cidr := .Values.vminsert.service.loadBalancerSourceRanges }} + - {{ $cidr }} + {{- end }} +{{- end }} + ports: + - name: http + port: {{ .Values.vminsert.service.servicePort }} + protocol: TCP + targetPort: http + selector: + {{- include "victoria-metrics.vminsert.matchLabels" . | nindent 4 }} + type: "{{ .Values.vminsert.service.type }}" +{{- end }} diff --git a/deployment/k8s/helm/victoria-metrics/templates/vmselect-deployment.yaml b/deployment/k8s/helm/victoria-metrics/templates/vmselect-deployment.yaml new file mode 100644 index 000000000..2d987f4b7 --- /dev/null +++ b/deployment/k8s/helm/victoria-metrics/templates/vmselect-deployment.yaml @@ -0,0 +1,72 @@ +{{- if .Values.vmselect.enabled -}} +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + labels: + {{- include "victoria-metrics.vmselect.labels" . | nindent 4 }} + name: {{ template "victoria-metrics.vmselect.fullname" . }} +spec: + selector: + matchLabels: + {{- include "victoria-metrics.vmselect.matchLabels" . | nindent 6 }} + replicas: {{ .Values.vmselect.replicaCount }} + template: + metadata: + {{- if .Values.vmselect.podAnnotations }} + annotations: +{{ toYaml .Values.vmselect.podAnnotations | indent 8 }} + {{- end }} + labels: + {{- include "victoria-metrics.vmselect.labels" . 
| nindent 8 }} + spec: +{{- if .Values.vmselect.priorityClassName }} + priorityClassName: "{{ .Values.vmselect.priorityClassName }}" +{{- end }} + containers: + - name: {{ template "victoria-metrics.name" . }}-{{ .Values.vmselect.name }} + image: "{{ .Values.vmselect.image.repository }}:{{ .Values.vmselect.image.tag }}" + imagePullPolicy: "{{ .Values.vmselect.image.pullPolicy }}" + args: + - {{ printf "%s=%s" "--cacheDataPath" .Values.vmselect.cacheMountPath | quote}} + {{- include "victoria-metrics.vmselect.vmstorage-pod-fqdn" . | nindent 12 }} + {{- range $key, $value := .Values.vmselect.extraArgs }} + - --{{ $key }}={{ $value }} + {{- end }} + ports: + - name: http + containerPort: 8481 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 15 + volumeMounts: + - mountPath: {{ .Values.vmselect.cacheMountPath }} + name: cache-volume + resources: +{{ toYaml .Values.vmselect.resources | indent 12 }} + {{- if .Values.imagePullSecrets }} + imagePullSecrets: + {{ toYaml .Values.imagePullSecrets | indent 2 }} + {{- end }} + {{- if .Values.vmselect.nodeSelector }} + nodeSelector: +{{ toYaml .Values.vmselect.nodeSelector | indent 8 }} + {{- end }} + {{- if .Values.vmselect.securityContext }} + securityContext: +{{ toYaml .Values.vmselect.securityContext | indent 8 }} + {{- end }} + {{- if .Values.vmselect.tolerations }} + tolerations: +{{ toYaml .Values.vmselect.tolerations | indent 8 }} + {{- end }} + {{- if .Values.vmselect.affinity }} + affinity: +{{ toYaml .Values.vmselect.affinity | indent 8 }} + {{- end }} + volumes: + - name: cache-volume + emptyDir: {} +{{- end }} diff --git a/deployment/k8s/helm/victoria-metrics/templates/vmselect-service.yaml b/deployment/k8s/helm/victoria-metrics/templates/vmselect-service.yaml new file mode 100644 index 000000000..f92120d44 --- /dev/null +++ b/deployment/k8s/helm/victoria-metrics/templates/vmselect-service.yaml @@ -0,0 +1,40 @@ +{{- if .Values.vmselect.enabled -}} +apiVersion: v1 +kind: Service +metadata: +{{- if .Values.vmselect.service.annotations }} + annotations: +{{ toYaml .Values.vmselect.service.annotations | indent 4}} +{{- end }} + labels: + {{- include "victoria-metrics.vmselect.labels" . | nindent 4 }} +{{- if .Values.vmselect.service.labels }} +{{ toYaml .Values.vmselect.service.labels | indent 4}} +{{- end }} + name: {{ template "victoria-metrics.vmselect.fullname" . }} +spec: +{{- if .Values.vmselect.service.clusterIP }} + clusterIP: {{ .Values.vmselect.service.clusterIP }} +{{- end }} +{{- if .Values.vmselect.service.externalIPs }} + externalIPs: +{{ toYaml .Values.vmselect.service.externalIPs | indent 4 }} +{{- end }} +{{- if .Values.vmselect.service.loadBalancerIP }} + loadBalancerIP: {{ .Values.vmselect.service.loadBalancerIP }} +{{- end }} +{{- if .Values.vmselect.service.loadBalancerSourceRanges }} + loadBalancerSourceRanges: + {{- range $cidr := .Values.vmselect.service.loadBalancerSourceRanges }} + - {{ $cidr }} + {{- end }} +{{- end }} + ports: + - name: http + port: {{ .Values.vmselect.service.servicePort }} + protocol: TCP + targetPort: http + selector: + {{- include "victoria-metrics.vmselect.matchLabels" . 
| nindent 4 }} + type: "{{ .Values.vmselect.service.type }}" +{{- end }} diff --git a/deployment/k8s/helm/victoria-metrics/templates/vmstorage-service.yaml b/deployment/k8s/helm/victoria-metrics/templates/vmstorage-service.yaml new file mode 100644 index 000000000..4b694907b --- /dev/null +++ b/deployment/k8s/helm/victoria-metrics/templates/vmstorage-service.yaml @@ -0,0 +1,32 @@ +{{- if .Values.vmstorage.enabled -}} +apiVersion: v1 +kind: Service +metadata: +{{- if .Values.vmstorage.service.annotations }} + annotations: +{{ toYaml .Values.vmstorage.service.annotations | indent 4 }} +{{- end }} + labels: + {{- include "victoria-metrics.vmstorage.labels" . | nindent 4 }} +{{- if .Values.vmstorage.service.labels }} +{{ toYaml .Values.vmstorage.service.labels | indent 4 }} +{{- end }} + name: {{ template "victoria-metrics.vmstorage.fullname" . }} +spec: + clusterIP: None + ports: + - port: {{ .Values.vmstorage.service.servicePort }} + targetPort: http + protocol: TCP + name: http + - port: {{ .Values.vmstorage.service.vmselectPort }} + targetPort: vmselect + protocol: TCP + name: vmselect + - port: {{ .Values.vmstorage.service.vminsertPort }} + targetPort: vminsert + protocol: TCP + name: vminsert + selector: + {{- include "victoria-metrics.vmstorage.matchLabels" . | nindent 4 }} +{{- end -}} diff --git a/deployment/k8s/helm/victoria-metrics/templates/vmstorage-statefulset.yaml b/deployment/k8s/helm/victoria-metrics/templates/vmstorage-statefulset.yaml new file mode 100644 index 000000000..367350358 --- /dev/null +++ b/deployment/k8s/helm/victoria-metrics/templates/vmstorage-statefulset.yaml @@ -0,0 +1,167 @@ +{{- if .Values.vmstorage.enabled -}} +apiVersion: apps/v1 +kind: StatefulSet +metadata: +{{- if .Values.vmstorage.annotations }} + annotations: +{{ toYaml .Values.vmstorage.annotations | indent 4 }} +{{- end }} + labels: + {{- include "victoria-metrics.vmstorage.labels" . | nindent 4 }} + name: {{ template "victoria-metrics.vmstorage.fullname" . }} +spec: + serviceName: {{ template "victoria-metrics.vmstorage.fullname" . }} + selector: + matchLabels: + {{- include "victoria-metrics.vmstorage.matchLabels" . | nindent 6 }} + replicas: {{ .Values.vmstorage.replicaCount }} + podManagementPolicy: {{ .Values.vmstorage.podManagementPolicy }} + template: + metadata: + {{- if .Values.vmstorage.podAnnotations }} + annotations: +{{ toYaml .Values.vmstorage.podAnnotations | indent 8 }} + {{- end }} + labels: + {{- include "victoria-metrics.vmstorage.labels" . | nindent 8 }} + spec: +{{- if .Values.vmstorage.affinity }} + affinity: +{{ toYaml .Values.vmstorage.affinity | indent 8 }} +{{- end }} +{{- if .Values.vmstorage.priorityClassName }} + priorityClassName: "{{ .Values.vmstorage.priorityClassName }}" +{{- end }} +{{- if .Values.vmstorage.schedulerName }} + schedulerName: "{{ .Values.vmstorage.schedulerName }}" +{{- end }} + containers: + - name: {{ template "victoria-metrics.name" . 
}}-{{ .Values.vmstorage.name }} + image: "{{ .Values.vmstorage.image.repository }}:{{ .Values.vmstorage.image.tag }}" + imagePullPolicy: "{{ .Values.vmstorage.image.pullPolicy }}" + args: + - {{ printf "%s=%d" "--retentionPeriod" (int .Values.vmstorage.retentionPeriod) | quote}} + - {{ printf "%s=%s" "--storageDataPath" .Values.vmstorage.persistentVolume.mountPath | quote}} + - '--vminsertAddr=:8401' + - '--vmselectAddr=:8400' + - '--httpListenAddr=:8482' + {{- range $key, $value := .Values.vmstorage.extraArgs }} + - --{{ $key }}={{ $value }} + {{- end }} + ports: + - name: http + containerPort: 8482 + - name: vmselect + containerPort: 8400 + - name: vminsert + containerPort: 8401 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 15 + timeoutSeconds: 5 + livenessProbe: + tcpSocket: + port: http + initialDelaySeconds: 5 + periodSeconds: 15 + timeoutSeconds: 5 + resources: +{{ toYaml .Values.vmstorage.resources | indent 12 }} + volumeMounts: + - name: vmstorage-volume + mountPath: {{ .Values.vmstorage.persistentVolume.mountPath }} + subPath: {{ .Values.vmstorage.persistentVolume.subPath }} + {{- range .Values.vmstorage.extraHostPathMounts }} + - name: {{ .name }} + mountPath: {{ .mountPath }} + subPath: {{ .subPath }} + readOnly: {{ .readOnly }} + {{- end }} + {{- range .Values.vmstorage.extraConfigmapMounts }} + - name: {{ $.Values.vmstorage.name }}-{{ .name }} + mountPath: {{ .mountPath }} + subPath: {{ .subPath }} + readOnly: {{ .readOnly }} + {{- end }} + {{- range .Values.vmstorage.extraSecretMounts }} + - name: {{ .name }} + mountPath: {{ .mountPath }} + subPath: {{ .subPath }} + readOnly: {{ .readOnly }} + {{- end }} + {{- if .Values.imagePullSecrets }} + imagePullSecrets: + {{ toYaml .Values.imagePullSecrets | indent 2 }} + {{- end }} + {{- if .Values.vmstorage.nodeSelector }} + nodeSelector: +{{ toYaml .Values.vmstorage.nodeSelector | indent 8 }} + {{- end }} + {{- if .Values.vmstorage.securityContext }} + securityContext: +{{ toYaml .Values.vmstorage.securityContext | indent 8 }} + {{- end }} + {{- if .Values.vmstorage.tolerations }} + tolerations: +{{ toYaml .Values.vmstorage.tolerations | indent 8 }} + {{- end }} + {{- if .Values.vmstorage.affinity }} + affinity: +{{ toYaml .Values.vmstorage.affinity | indent 8 }} + {{- end }} + terminationGracePeriodSeconds: {{ .Values.vmstorage.terminationGracePeriodSeconds }} + volumes: + {{- range .Values.vmstorage.extraHostPathMounts }} + - name: {{ .name }} + hostPath: + path: {{ .hostPath }} + {{- end }} + {{- range .Values.vmstorage.extraConfigmapMounts }} + - name: {{ $.Values.vmstorage.name }}-{{ .name }} + configMap: + name: {{ .configMap }} + {{- end }} + {{- range .Values.vmstorage.extraConfigmapMounts }} + - name: {{ $.Values.vmstorage.name }}-{{ .name }} + configMap: + name: {{ .configMap }} + {{- end }} + {{- range .Values.vmstorage.extraSecretMounts }} + - name: {{ .name }} + secret: + secretName: {{ .secretName }} + {{- end }} + {{- range .Values.vmstorage.extraConfigmapMounts }} + - name: {{ .name }} + configMap: + name: {{ .configMap }} + {{- end }} +{{- if .Values.vmstorage.persistentVolume.enabled }} + volumeClaimTemplates: + - metadata: + name: vmstorage-volume + {{- if .Values.vmstorage.persistentVolume.annotations }} + annotations: +{{ toYaml .Values.vmstorage.persistentVolume.annotations | indent 10 }} + {{- end }} + spec: + accessModes: +{{ toYaml .Values.vmstorage.persistentVolume.accessModes | indent 10 }} + resources: + requests: + storage: "{{ 
.Values.vmstorage.persistentVolume.size }}" + {{- if .Values.vmstorage.persistentVolume.storageClass }} + {{- if (eq "-" .Values.vmstorage.persistentVolume.storageClass) }} + storageClassName: "" + {{- else }} + storageClassName: "{{ .Values.vmstorage.persistentVolume.storageClass }}" + {{- end }} + {{- end }} +{{- else }} + - name: vmstorage-volume + emptyDir: {} +{{- end }} +{{- end }} diff --git a/deployment/k8s/helm/victoria-metrics/values.yaml b/deployment/k8s/helm/victoria-metrics/values.yaml new file mode 100644 index 000000000..28fa09c93 --- /dev/null +++ b/deployment/k8s/helm/victoria-metrics/values.yaml @@ -0,0 +1,213 @@ +# Default values for victoria-metrics. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +## Tour k8s cluster domain suffix, uses for pods' FQDN +## Ref: https://kubernetes.io/docs/tasks/administer-cluster/dns-custom-nameservers/ +## +clusterDomainSuffix: cluster.local + +vmselect: + enabled: true + name: vmselect + image: + repository: valyala/vmselect + tag: heads-cluster-0-gca0d4847 + pullPolicy: IfNotPresent + priorityClassName: "" + extraArgs: {} + + ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ + ## + tolerations: [] + # - key: "key" + # operator: "Equal|Exists" + # value: "value" + # effect: "NoSchedule|PreferNoSchedule" + + ## Ref: https://kubernetes.io/docs/user-guide/node-selection/ + ## + nodeSelector: {} + podAnnotations: + prometheus.io/scrape: "true" + replicaCount: 2 + resources: {} + # limits: + # cpu: 50m + # memory: 64Mi + # requests: + # cpu: 50m + # memory: 64Mi + securityContext: {} + ## Root folder for cache + ## + cacheMountPath: /cache + service: + annotations: {} + labels: {} + clusterIP: "" + ## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips + ## + externalIPs: [] + loadBalancerIP: "" + loadBalancerSourceRanges: [] + servicePort: 8481 + type: ClusterIP + +vminsert: + enabled: true + name: vminsert + image: + repository: valyala/vminsert + tag: heads-cluster-0-gca0d4847 + pullPolicy: IfNotPresent + priorityClassName: "" + extraArgs: {} + + ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ + ## + tolerations: [] + # - key: "key" + # operator: "Equal|Exists" + # value: "value" + # effect: "NoSchedule|PreferNoSchedule" + + ## Ref: https://kubernetes.io/docs/user-guide/node-selection/ + ## + nodeSelector: {} + podAnnotations: + prometheus.io/scrape: "true" + replicaCount: 2 + resources: {} + # limits: + # cpu: 50m + # memory: 64Mi + # requests: + # cpu: 50m + # memory: 64Mi + securityContext: {} + service: + annotations: {} + labels: {} + clusterIP: "" + ## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips + ## + externalIPs: [] + loadBalancerIP: "" + loadBalancerSourceRanges: [] + servicePort: 8480 + type: ClusterIP + +vmstorage: + enabled: true + name: vmstorage + image: + repository: valyala/vmstorage + tag: heads-cluster-0-gca0d4847 + pullPolicy: IfNotPresent + priorityClassName: "" + fullnameOverride: + ## Data retention period in month + ## + retentionPeriod: 1 + ## Additional vmstorage container arguments + ## + extraArgs: {} + + ## Additional vmstorage hostPath mounts + ## + extraHostPathMounts: [] + # - name: certs-dir + # mountPath: /etc/kubernetes/certs + # subPath: "" + # hostPath: /etc/kubernetes/certs + # readOnly: true + + extraConfigmapMounts: [] + # - name: certs-configmap + # mountPath: /certs + # subPath: "" + # configMap: certs-configmap + # readOnly: true + + ## Additional Vmstorage Secret 
mounts + # Defines additional mounts with secrets. Secrets must be manually created in the namespace. + extraSecretMounts: [] + # - name: secret-files + # mountPath: /etc/secrets + # subPath: "" + # secretName: secret-files + # readOnly: true + + ## Node tolerations for server scheduling to nodes with taints + ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ + ## + tolerations: [] + # - key: "key" + # operator: "Equal|Exists" + # value: "value" + # effect: "NoSchedule|PreferNoSchedule" + + ## Ref: https://kubernetes.io/docs/user-guide/node-selection/ + ## + nodeSelector: {} + + ## Pod affinity + ## + affinity: {} + + ## Use an alternate scheduler, e.g. "stork". + ## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/ + ## + # schedulerName: + + persistentVolume: + ## If true, vmstorage will create/use a Persistent Volume Claim + ## If false, use emptyDir + ## + enabled: true + + ## Must match those of existing PV or dynamic provisioner + ## Ref: http://kubernetes.io/docs/user-guide/persistent-volumes/ + ## + accessModes: + - ReadWriteOnce + annotations: {} + + ## Requires vmstorage.persistentVolume.enabled: true + ## If defined, PVC must be created manually before volume will be bound + existingClaim: "" + + ## Vmstorage data Persistent Volume mount root path + ## + mountPath: /storage + size: 8Gi + subPath: "" + + + podAnnotations: { + prometheus.io/scrape: "true" + } + replicaCount: 2 + podManagementPolicy: OrderedReady + + ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + ## + resources: {} + # limits: + # cpu: 500m + # memory: 512Mi + # requests: + # cpu: 500m + # memory: 512Mi + + ## Security context to be added to server pods + ## + securityContext: {} + service: + annotations: {} + labels: {} + servicePort: 8482 + vmselectPort: 8400 + vminsertPort: 8401 + terminationGracePeriodSeconds: 60 diff --git a/go.mod b/go.mod index cc0a71ebb..2272042bf 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,7 @@ require ( github.com/VictoriaMetrics/metrics v1.4.0 github.com/cespare/xxhash/v2 v2.0.1-0.20190104013014-3767db7a7e18 github.com/golang/snappy v0.0.1 + github.com/lithammer/go-jump-consistent-hash v1.0.0 github.com/spaolacci/murmur3 v1.1.0 // indirect github.com/valyala/fastjson v1.4.1 github.com/valyala/gozstd v1.5.0 diff --git a/go.sum b/go.sum index 934851e73..8a6b282ed 100644 --- a/go.sum +++ b/go.sum @@ -20,6 +20,8 @@ github.com/klauspost/compress v1.4.0/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0 github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= github.com/klauspost/cpuid v0.0.0-20180405133222-e7e905edc00e/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= +github.com/lithammer/go-jump-consistent-hash v1.0.0 h1:TmRnbmkUcGJzfiCXhy/D1FFtGLYEQfGWawHffhsTevI= +github.com/lithammer/go-jump-consistent-hash v1.0.0/go.mod h1:Snz99O1UkmvgsOV76Jm7Zu4sokENziqvUCbPztFABIU= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= diff --git a/lib/auth/auth.go b/lib/auth/auth.go new file mode 100644 index 000000000..7056eb0b4 --- /dev/null +++ b/lib/auth/auth.go @@ -0,0 +1,35 @@ +package auth + +import ( + "fmt" + "strconv" + "strings" +) + +// Token contains 
settings for request processing +type Token struct { + ProjectID uint32 + AccountID uint32 +} + +// NewToken returns new Token for the given authToken +func NewToken(authToken string) (*Token, error) { + tmp := strings.Split(authToken, ":") + if len(tmp) > 2 { + return nil, fmt.Errorf("unexpected number of items in authToken %q; got %d; want 1 or 2", authToken, len(tmp)) + } + var at Token + accountID, err := strconv.Atoi(tmp[0]) + if err != nil { + return nil, fmt.Errorf("cannot parse accountID from %q: %s", tmp[0], err) + } + at.AccountID = uint32(accountID) + if len(tmp) > 1 { + projectID, err := strconv.Atoi(tmp[1]) + if err != nil { + return nil, fmt.Errorf("cannot parse projectID from %q: %s", tmp[1], err) + } + at.ProjectID = uint32(projectID) + } + return &at, nil +} diff --git a/lib/consts/consts.go b/lib/consts/consts.go new file mode 100644 index 000000000..3b16e56f9 --- /dev/null +++ b/lib/consts/consts.go @@ -0,0 +1,4 @@ +package consts + +// MaxInsertPacketSize is the maximum packet size in bytes vminsert may send to vmstorage. +const MaxInsertPacketSize = 100 * 1024 * 1024 diff --git a/lib/handshake/buffered_conn.go b/lib/handshake/buffered_conn.go new file mode 100644 index 000000000..774adf036 --- /dev/null +++ b/lib/handshake/buffered_conn.go @@ -0,0 +1,80 @@ +package handshake + +import ( + "bufio" + "io" + "net" + + "github.com/valyala/gozstd" +) + +type bufferedWriter interface { + Write(p []byte) (int, error) + Flush() error +} + +// BufferedConn is a net.Conn with Flush suport. +type BufferedConn struct { + net.Conn + + br io.Reader + bw bufferedWriter +} + +const bufferSize = 64 * 1024 + +// newBufferedConn returns buffered connection with the given compression level. +func newBufferedConn(c net.Conn, compressionLevel int, isReadCompressed bool) *BufferedConn { + bc := &BufferedConn{ + Conn: c, + } + if compressionLevel <= 0 { + bc.bw = bufio.NewWriterSize(c, bufferSize) + } else { + bc.bw = gozstd.NewWriterLevel(c, compressionLevel) + } + if !isReadCompressed { + bc.br = bufio.NewReaderSize(c, bufferSize) + } else { + bc.br = gozstd.NewReader(c) + } + return bc +} + +// Read reads up to len(p) from bc to p. +func (bc *BufferedConn) Read(p []byte) (int, error) { + return bc.br.Read(p) +} + +// Write writes p to bc. +// +// Do not forget to call Flush if needed. +func (bc *BufferedConn) Write(p []byte) (int, error) { + return bc.bw.Write(p) +} + +// Close closes bc. +func (bc *BufferedConn) Close() error { + // Close the Conn at first. It is expected that all the required data + // is already flushed to the Conn. + err := bc.Conn.Close() + bc.Conn = nil + + if zr, ok := bc.br.(*gozstd.Reader); ok { + zr.Release() + } + bc.br = nil + + if zw, ok := bc.bw.(*gozstd.Writer); ok { + // Do not call zw.Close(), since we already closed the underlying conn. + zw.Release() + } + bc.bw = nil + + return err +} + +// Flush flushes internal write buffers to the underlying conn. +func (bc *BufferedConn) Flush() error { + return bc.bw.Flush() +} diff --git a/lib/handshake/handshake.go b/lib/handshake/handshake.go new file mode 100644 index 000000000..0a71bdd1a --- /dev/null +++ b/lib/handshake/handshake.go @@ -0,0 +1,170 @@ +package handshake + +import ( + "fmt" + "io" + "net" + "time" +) + +const ( + vminsertHello = "vminsert.01" + vmselectHello = "vmselect.01" + + successResponse = "ok" +) + +// Func must perform handshake on the given c using the given compressionLevel. +// +// It must return BufferedConn wrapper for c on successful handshake. 
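A minimal usage sketch for the `accountID[:projectID]` auth tokens parsed by the new `lib/auth` package above; the token values are made-up examples, not values taken from this diff:

```go
package main

import (
	"fmt"
	"log"

	"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
)

func main() {
	// Full form: accountID and projectID separated by a colon.
	at, err := auth.NewToken("42:7")
	if err != nil {
		log.Fatalf("cannot parse auth token: %s", err)
	}
	fmt.Println(at.AccountID, at.ProjectID) // 42 7

	// Short form: projectID is optional and defaults to 0.
	at, err = auth.NewToken("42")
	if err != nil {
		log.Fatalf("cannot parse auth token: %s", err)
	}
	fmt.Println(at.AccountID, at.ProjectID) // 42 0
}
```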
+type Func func(c net.Conn, compressionLevel int) (*BufferedConn, error) + +// VMInsertClient performs client-side handshake for vminsert protocol. +// +// compressionLevel is the level used for compression of the data sent +// to the server. +// compressionLevel <= 0 means 'no compression' +func VMInsertClient(c net.Conn, compressionLevel int) (*BufferedConn, error) { + return genericClient(c, vminsertHello, compressionLevel) +} + +// VMInsertServer performs server-side handshake for vminsert protocol. +// +// compressionLevel is the level used for compression of the data sent +// to the client. +// compressionLevel <= 0 means 'no compression' +func VMInsertServer(c net.Conn, compressionLevel int) (*BufferedConn, error) { + return genericServer(c, vminsertHello, compressionLevel) +} + +// VMSelectClient performs client-side handshake for vmselect protocol. +// +// compressionLevel is the level used for compression of the data sent +// to the server. +// compressionLevel <= 0 means 'no compression' +func VMSelectClient(c net.Conn, compressionLevel int) (*BufferedConn, error) { + return genericClient(c, vmselectHello, compressionLevel) +} + +// VMSelectServer performs server-side handshake for vmselect protocol. +// +// compressionLevel is the level used for compression of the data sent +// to the client. +// compressionLevel <= 0 means 'no compression' +func VMSelectServer(c net.Conn, compressionLevel int) (*BufferedConn, error) { + return genericServer(c, vmselectHello, compressionLevel) +} + +func genericServer(c net.Conn, msg string, compressionLevel int) (*BufferedConn, error) { + if err := readMessage(c, msg); err != nil { + return nil, fmt.Errorf("cannot read hello: %s", err) + } + if err := writeMessage(c, successResponse); err != nil { + return nil, fmt.Errorf("cannot write success response on hello: %s", err) + } + isRemoteCompressed, err := readIsCompressed(c) + if err != nil { + return nil, fmt.Errorf("cannot read isCompressed flag: %s", err) + } + if err := writeMessage(c, successResponse); err != nil { + return nil, fmt.Errorf("cannot write success response on isCompressed: %s", err) + } + if err := writeIsCompressed(c, compressionLevel > 0); err != nil { + return nil, fmt.Errorf("cannot write isCompressed flag: %s", err) + } + if err := readMessage(c, successResponse); err != nil { + return nil, fmt.Errorf("cannot read success response on isCompressed: %s", err) + } + bc := newBufferedConn(c, compressionLevel, isRemoteCompressed) + return bc, nil +} + +func genericClient(c net.Conn, msg string, compressionLevel int) (*BufferedConn, error) { + if err := writeMessage(c, msg); err != nil { + return nil, fmt.Errorf("cannot write hello: %s", err) + } + if err := readMessage(c, successResponse); err != nil { + return nil, fmt.Errorf("cannot read success response after sending hello: %s", err) + } + if err := writeIsCompressed(c, compressionLevel > 0); err != nil { + return nil, fmt.Errorf("cannot write isCompressed flag: %s", err) + } + if err := readMessage(c, successResponse); err != nil { + return nil, fmt.Errorf("cannot read success response on isCompressed: %s", err) + } + isRemoteCompressed, err := readIsCompressed(c) + if err != nil { + return nil, fmt.Errorf("cannot read isCompressed flag: %s", err) + } + if err := writeMessage(c, successResponse); err != nil { + return nil, fmt.Errorf("cannot write success response on isCompressed: %s", err) + } + bc := newBufferedConn(c, compressionLevel, isRemoteCompressed) + return bc, nil +} + +func writeIsCompressed(c net.Conn, 
isCompressed bool) error { + var buf [1]byte + if isCompressed { + buf[0] = 1 + } + return writeMessage(c, string(buf[:])) +} + +func readIsCompressed(c net.Conn) (bool, error) { + buf, err := readData(c, 1) + if err != nil { + return false, err + } + isCompressed := (buf[0] != 0) + return isCompressed, nil +} + +func writeMessage(c net.Conn, msg string) error { + if err := c.SetWriteDeadline(time.Now().Add(time.Second)); err != nil { + return fmt.Errorf("cannot set write deadline: %s", err) + } + if _, err := io.WriteString(c, msg); err != nil { + return fmt.Errorf("cannot write %q to server: %s", msg, err) + } + if fc, ok := c.(flusher); ok { + if err := fc.Flush(); err != nil { + return fmt.Errorf("cannot flush %q to server: %s", msg, err) + } + } + if err := c.SetWriteDeadline(zeroTime); err != nil { + return fmt.Errorf("cannot reset write deadline: %s", err) + } + return nil +} + +type flusher interface { + Flush() error +} + +func readMessage(c net.Conn, msg string) error { + buf, err := readData(c, len(msg)) + if err != nil { + return err + } + if string(buf) != msg { + return fmt.Errorf("unexpected message obtained; got %q; want %q", buf, msg) + } + return nil +} + +func readData(c net.Conn, dataLen int) ([]byte, error) { + if err := c.SetReadDeadline(time.Now().Add(time.Second)); err != nil { + return nil, fmt.Errorf("cannot set read deadline: %s", err) + } + data := make([]byte, dataLen) + if _, err := io.ReadFull(c, data); err != nil { + return nil, fmt.Errorf("cannot read message with size %d: %s", dataLen, err) + } + if err := c.SetReadDeadline(zeroTime); err != nil { + return nil, fmt.Errorf("cannot reset read deadline: %s", err) + } + return data, nil +} + +var zeroTime time.Time diff --git a/lib/handshake/handshake_test.go b/lib/handshake/handshake_test.go new file mode 100644 index 000000000..4d4d9b229 --- /dev/null +++ b/lib/handshake/handshake_test.go @@ -0,0 +1,61 @@ +package handshake + +import ( + "fmt" + "net" + "testing" + "time" +) + +func TestVMInsertHandshake(t *testing.T) { + testHandshake(t, VMInsertClient, VMInsertServer) +} + +func TestVMSelectHandshake(t *testing.T) { + testHandshake(t, VMSelectClient, VMSelectServer) +} + +func testHandshake(t *testing.T, clientFunc, serverFunc Func) { + t.Helper() + + c, s := net.Pipe() + ch := make(chan error, 1) + go func() { + bcs, err := serverFunc(s, 3) + if err != nil { + ch <- fmt.Errorf("error on outer handshake: %s", err) + return + } + bcc, err := clientFunc(bcs, 3) + if err != nil { + ch <- fmt.Errorf("error on inner handshake: %s", err) + return + } + if bcc == nil { + ch <- fmt.Errorf("expecting non-nil conn") + return + } + ch <- nil + }() + + bcc, err := clientFunc(c, 0) + if err != nil { + t.Fatalf("error on outer handshake: %s", err) + } + bcs, err := serverFunc(bcc, 0) + if err != nil { + t.Fatalf("error on inner handshake: %s", err) + } + if bcs == nil { + t.Fatalf("expecting non-nil conn") + } + + select { + case <-time.After(5 * time.Second): + t.Fatalf("timeout") + case err := <-ch: + if err != nil { + t.Fatalf("unexpected error on the server side: %s", err) + } + } +} diff --git a/lib/httpserver/httpserver.go b/lib/httpserver/httpserver.go index c03ac83d3..d8da38f88 100644 --- a/lib/httpserver/httpserver.go +++ b/lib/httpserver/httpserver.go @@ -5,7 +5,6 @@ import ( "compress/gzip" "context" "crypto/tls" - "flag" "fmt" "io" "net" @@ -22,17 +21,6 @@ import ( "github.com/VictoriaMetrics/metrics" ) -var ( - tlsEnable = flag.Bool("tls", false, "Whether to enable TLS (aka HTTPS) for incoming requests. 
tlsCertFile and tlsKeyFile must be set if tls=true") - tlsCertFile = flag.String("tlsCertFile", "", "Path to file with TLS certificate. Used only if tls=true. Prefer ECDSA certs instead of RSA certs, since RSA certs are slow") - tlsKeyFile = flag.String("tlsKeyFile", "", "Path to file with TLS key. Used only if tls=true") - - httpAuthUsername = flag.String("httpAuth.username", "", "Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password") - httpAuthPassword = flag.String("httpAuth.password", "", "Password for HTTP Basic Auth. The authentication is disabled -httpAuth.username is empty") - metricsAuthKey = flag.String("metricsAuthKey", "", "Auth key for /metrics. It overrides httpAuth settings") - pprofAuthKey = flag.String("pprofAuthKey", "", "Auth key for /debug/pprof. It overrides httpAuth settings") -) - var ( servers = make(map[string]*http.Server) serversLock sync.Mutex @@ -52,29 +40,13 @@ type RequestHandler func(w http.ResponseWriter, r *http.Request) bool // charges a lot for the egress traffic. The compression may be disabled // by calling DisableResponseCompression before writing the first byte to w. func Serve(addr string, rh RequestHandler) { - scheme := "http" - if *tlsEnable { - scheme = "https" - } - logger.Infof("starting http server at %s://%s/", scheme, addr) - logger.Infof("pprof handlers are exposed at %s://%s/debug/pprof/", scheme, addr) - lnTmp, err := netutil.NewTCPListener(scheme, addr) + logger.Infof("starting http server at http://%s/", addr) + logger.Infof("pprof handlers are exposed at http://%s/debug/pprof/", addr) + ln, err := netutil.NewTCPListener("http", addr) if err != nil { - logger.Fatalf("cannot start http server at %s: %s", addr, err) - } - setNetworkTimeouts(lnTmp) - ln := net.Listener(lnTmp) - - if *tlsEnable { - cert, err := tls.LoadX509KeyPair(*tlsCertFile, *tlsKeyFile) - if err != nil { - logger.Fatalf("cannot load TLS cert from tlsCertFile=%q, tlsKeyFile=%q: %s", *tlsCertFile, *tlsKeyFile, err) - } - cfg := &tls.Config{ - Certificates: []tls.Certificate{cert}, - } - ln = tls.NewListener(ln, cfg) + logger.Panicf("FATAL: cannot start http server at %s: %s", addr, err) } + setNetworkTimeouts(ln) serveWithListener(addr, ln, rh) } @@ -151,9 +123,6 @@ var metricsHandlerDuration = metrics.NewSummary(`vm_http_request_duration_second func handlerWrapper(w http.ResponseWriter, r *http.Request, rh RequestHandler) { requestsTotal.Inc() - if !checkAuth(w, r) { - return - } switch r.URL.Path { case "/health": w.Header().Set("Content-Type", "text/plain") @@ -177,7 +146,6 @@ func handlerWrapper(w http.ResponseWriter, r *http.Request, rh RequestHandler) { pprofHandler(r.URL.Path[len("/debug/pprof/"):], w, r) return } - if rh(w, r) { return } @@ -188,41 +156,6 @@ func handlerWrapper(w http.ResponseWriter, r *http.Request, rh RequestHandler) { } } -func checkAuth(w http.ResponseWriter, r *http.Request) bool { - path := r.URL.Path - if path == "/metrics" && len(*metricsAuthKey) > 0 { - authKey := r.FormValue("authKey") - if *metricsAuthKey == authKey { - return true - } - http.Error(w, "The provided authKey doesn't match -metricsAuthKey", http.StatusUnauthorized) - return false - } - if strings.HasPrefix(path, "/debug/pprof/") && len(*pprofAuthKey) > 0 { - authKey := r.FormValue("authKey") - if *pprofAuthKey == authKey { - return true - } - http.Error(w, "The provided authKey doesn't match -pprofAuthKey", http.StatusUnauthorized) - return false - } - return checkBasicAuth(w, r) -} - -func checkBasicAuth(w 
http.ResponseWriter, r *http.Request) bool { - if len(*httpAuthUsername) == 0 { - // HTTP Basic Auth is disabled. - return true - } - username, password, ok := r.BasicAuth() - if ok && username == *httpAuthUsername && password == *httpAuthPassword { - return true - } - w.Header().Set("WWW-Authenticate", `Basic realm="VictoriaMetrics"`) - http.Error(w, "", http.StatusUnauthorized) - return false -} - func maybeGzipResponseWriter(w http.ResponseWriter, r *http.Request) http.ResponseWriter { ae := r.Header.Get("Accept-Encoding") if ae == "" { diff --git a/lib/httpserver/path.go b/lib/httpserver/path.go new file mode 100644 index 000000000..833c0a6b4 --- /dev/null +++ b/lib/httpserver/path.go @@ -0,0 +1,64 @@ +package httpserver + +import ( + "fmt" + "strings" +) + +// Path contains the following path structure: +// /{prefix}/{authToken}/{suffix} +// +// It is compatible with SaaS version. +type Path struct { + Prefix string + AuthToken string + Suffix string +} + +// ParsePath parses the given path. +func ParsePath(path string) (*Path, error) { + // The path must have the following form: + // /{prefix}/{authToken}/{suffix} + // + // - prefix must contain `select`, `insert` or `delete`. + // - authToken contains `accountID[:projectID]`, where projectID is optional. + // - suffix contains arbitrary suffix. + // + // prefix must be used for the routing to the appropriate service + // in the cluster - either vminsert or vmselect. + s := skipPrefixSlashes(path) + n := strings.IndexByte(s, '/') + if n < 0 { + return nil, fmt.Errorf("cannot find {prefix}") + } + prefix := s[:n] + + s = skipPrefixSlashes(s[n+1:]) + n = strings.IndexByte(s, '/') + if n < 0 { + return nil, fmt.Errorf("cannot find {authToken}") + } + authToken := s[:n] + + s = skipPrefixSlashes(s[n+1:]) + + // Substitute double slashes with single slashes in the path, since such slashes + // may appear due improper copy-pasting of the url. + suffix := strings.Replace(s, "//", "/", -1) + + p := &Path{ + Prefix: prefix, + AuthToken: authToken, + Suffix: suffix, + } + return p, nil +} + +// skipPrefixSlashes remove double slashes which may appear due +// improper copy-pasting of the url +func skipPrefixSlashes(s string) string { + for len(s) > 0 && s[0] == '/' { + s = s[1:] + } + return s +} diff --git a/lib/netutil/conn_pool.go b/lib/netutil/conn_pool.go new file mode 100644 index 000000000..aa3c63e13 --- /dev/null +++ b/lib/netutil/conn_pool.go @@ -0,0 +1,76 @@ +package netutil + +import ( + "fmt" + "sync" + + "github.com/VictoriaMetrics/VictoriaMetrics/lib/handshake" +) + +// ConnPool is a connection pool with ZSTD-compressed connections. +type ConnPool struct { + mu sync.Mutex + d *TCPDialer + + name string + handshakeFunc handshake.Func + compressionLevel int + + conns []*handshake.BufferedConn +} + +// NewConnPool creates a new connection pool for the given addr. +// +// Name is used in exported metrics. +// handshakeFunc is used for handshaking after the connection establishing. +// The compression is disabled if compressionLevel <= 0. +func NewConnPool(name, addr string, handshakeFunc handshake.Func, compressionLevel int) *ConnPool { + return &ConnPool{ + d: NewTCPDialer(name, addr), + + name: name, + handshakeFunc: handshakeFunc, + compressionLevel: compressionLevel, + } +} + +// Addr returns the address where connections are established. +func (cp *ConnPool) Addr() string { + return cp.d.addr +} + +// Get returns free connection from the pool. 
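For reference, a small sketch of how the `/{prefix}/{authToken}/{suffix}` layout handled by `ParsePath` above decomposes; the concrete path below is a hypothetical example:

```go
package main

import (
	"fmt"
	"log"

	"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
)

func main() {
	// prefix routes the request (here to vmselect), authToken carries
	// accountID[:projectID], and the suffix is passed to the handler.
	p, err := httpserver.ParsePath("/select/42:7/prometheus/api/v1/query")
	if err != nil {
		log.Fatalf("cannot parse path: %s", err)
	}
	fmt.Println(p.Prefix)    // select
	fmt.Println(p.AuthToken) // 42:7
	fmt.Println(p.Suffix)    // prometheus/api/v1/query
}
```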
+func (cp *ConnPool) Get() (*handshake.BufferedConn, error) { + var bc *handshake.BufferedConn + cp.mu.Lock() + if len(cp.conns) > 0 { + bc = cp.conns[len(cp.conns)-1] + cp.conns[len(cp.conns)-1] = nil + cp.conns = cp.conns[:len(cp.conns)-1] + } + cp.mu.Unlock() + if bc != nil { + return bc, nil + } + + // Pool is empty. Create new connection. + c, err := cp.d.Dial() + if err != nil { + return nil, fmt.Errorf("cannot dial %s: %s", cp.d.Addr(), err) + } + if bc, err = cp.handshakeFunc(c, cp.compressionLevel); err != nil { + err = fmt.Errorf("cannot perform %q handshake with server %q: %s", cp.name, cp.d.Addr(), err) + _ = c.Close() + return nil, err + } + return bc, nil +} + +// Put puts bc back to the pool. +// +// Do not put broken and closed connections to the pool! +func (cp *ConnPool) Put(bc *handshake.BufferedConn) { + cp.mu.Lock() + cp.conns = append(cp.conns, bc) + cp.mu.Unlock() +} diff --git a/lib/netutil/tcpdialer.go b/lib/netutil/tcpdialer.go new file mode 100644 index 000000000..ad3bfdc97 --- /dev/null +++ b/lib/netutil/tcpdialer.go @@ -0,0 +1,64 @@ +package netutil + +import ( + "fmt" + "net" + "time" + + "github.com/VictoriaMetrics/metrics" +) + +// NewTCPDialer returns new dialer for dialing the given addr. +// +// The name is used in metric tags for the returned dialer. +// The name must be unique among dialers. +func NewTCPDialer(name, addr string) *TCPDialer { + d := &TCPDialer{ + d: &net.Dialer{ + Timeout: time.Second, + KeepAlive: time.Second, + }, + + addr: addr, + + dials: metrics.NewCounter(fmt.Sprintf(`vm_tcpdialer_dials_total{name=%q, addr=%q}`, name, addr)), + dialErrors: metrics.NewCounter(fmt.Sprintf(`vm_tcpdialer_errors_total{name=%q, addr=%q, type="dial"}`, name, addr)), + } + d.connMetrics.init("vm_tcpdialer", name, addr) + return d +} + +// TCPDialer is used for dialing the addr passed to NewTCPDialer. +// +// It also gathers various stats for dialed connections. +type TCPDialer struct { + d *net.Dialer + + addr string + + dials *metrics.Counter + dialErrors *metrics.Counter + + connMetrics +} + +// Dial dials the addr passed to NewTCPDialer. +func (d *TCPDialer) Dial() (net.Conn, error) { + d.dials.Inc() + c, err := d.d.Dial("tcp4", d.addr) + if err != nil { + d.dialErrors.Inc() + return nil, err + } + d.conns.Inc() + sc := &statConn{ + Conn: c, + cm: &d.connMetrics, + } + return sc, err +} + +// Addr returns the address the dialer dials to. +func (d *TCPDialer) Addr() string { + return d.addr +} diff --git a/lib/storage/block_header_test.go b/lib/storage/block_header_test.go index ea5cdd669..cba1b992d 100644 --- a/lib/storage/block_header_test.go +++ b/lib/storage/block_header_test.go @@ -11,7 +11,7 @@ func TestMarshaledBlockHeaderSize(t *testing.T) { // This test makes sure marshaled format isn't changed. // If this test breaks then the storage format has been changed, // so it may become incompatible with the previously written data. - expectedSize := 81 + expectedSize := 89 if marshaledBlockHeaderSize != expectedSize { t.Fatalf("unexpected marshaledBlockHeaderSize; got %d; want %d", marshaledBlockHeaderSize, expectedSize) } diff --git a/lib/storage/index_db.go b/lib/storage/index_db.go index c1150a1e1..967539cc3 100644 --- a/lib/storage/index_db.go +++ b/lib/storage/index_db.go @@ -60,6 +60,9 @@ type indexDB struct { // Cache for fast MetricID -> MetricName lookup. 
metricNameCache *fastcache.Cache + tagCachePrefixes map[accountProjectKey]uint64 + tagCachePrefixesLock sync.RWMutex + indexSearchPool sync.Pool // An inmemory map[uint64]struct{} of deleted metricIDs. @@ -78,6 +81,12 @@ type indexDB struct { mustDrop uint64 } +// accountProjectKey is used for maps keyed by (AccountID, ProjectID). +type accountProjectKey struct { + AccountID uint32 + ProjectID uint32 +} + // openIndexDB opens index db from the given path with the given caches. func openIndexDB(path string, metricIDCache, metricNameCache *fastcache.Cache) (*indexDB, error) { tb, err := mergeset.OpenTable(path) @@ -99,6 +108,8 @@ func openIndexDB(path string, metricIDCache, metricNameCache *fastcache.Cache) ( tagCache: tagCache, metricIDCache: metricIDCache, metricNameCache: metricNameCache, + + tagCachePrefixes: make(map[accountProjectKey]uint64), } is := db.getIndexSearch() @@ -240,6 +251,10 @@ func (db *indexDB) putToTagCache(tsids []TSID, key []byte) { } func (db *indexDB) getFromMetricIDCache(dst *TSID, metricID uint64) error { + // There is no need in prefixing the key with (accountID, projectID), + // since metricID is globally unique across all (accountID, projectID) values. + // See getUniqueUint64. + // There is no need in checking for deleted metricIDs here, since they // must be checked by the caller. buf := (*[unsafe.Sizeof(*dst)]byte)(unsafe.Pointer(dst)) @@ -262,6 +277,10 @@ func (db *indexDB) putToMetricIDCache(metricID uint64, tsid *TSID) { } func (db *indexDB) getMetricNameFromCache(dst []byte, metricID uint64) []byte { + // There is no need in prefixing the key with (accountID, projectID), + // since metricID is globally unique across all (accountID, projectID) values. + // See getUniqueUint64. + // There is no need in checking for deleted metricIDs here, since they // must be checked by the caller. key := (*[unsafe.Sizeof(metricID)]byte)(unsafe.Pointer(&metricID)) @@ -273,13 +292,28 @@ func (db *indexDB) putMetricNameToCache(metricID uint64, metricName []byte) { db.metricNameCache.Set(key[:], metricName) } -func marshalTagFiltersKey(dst []byte, tfss []*TagFilters) []byte { - prefix := atomic.LoadUint64(&tagFiltersKeyGen) +func (db *indexDB) marshalTagFiltersKey(dst []byte, tfss []*TagFilters) []byte { + if len(tfss) == 0 { + return nil + } + k := accountProjectKey{ + AccountID: tfss[0].accountID, + ProjectID: tfss[0].projectID, + } + db.tagCachePrefixesLock.RLock() + prefix := db.tagCachePrefixes[k] + db.tagCachePrefixesLock.RUnlock() + if prefix == 0 { + // Create missing prefix. + // It is if multiple concurrent goroutines call invalidateTagCache + // for the same (accountID, projectID). + prefix = db.invalidateTagCache(k.AccountID, k.ProjectID) + } dst = encoding.MarshalUint64(dst, prefix) for _, tfs := range tfss { dst = append(dst, 0) // separator between tfs groups. for i := range tfs.tfs { - dst = tfs.tfs[i].Marshal(dst) + dst = tfs.tfs[i].MarshalNoAccountIDProjectID(dst) } } return dst @@ -317,13 +351,21 @@ func unmarshalTSIDs(dst []TSID, src []byte) ([]TSID, error) { return dst, nil } -func (db *indexDB) invalidateTagCache() { +func (db *indexDB) invalidateTagCache(accountID, projectID uint32) uint64 { // This function must be fast, since it is called each // time new timeseries is added. 
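To make the intent of the tag-cache change above easier to follow, here is a simplified standalone sketch of the per-tenant cache-generation scheme; it is not the indexDB code itself, and all names are illustrative:

```go
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

type tenantKey struct {
	AccountID uint32
	ProjectID uint32
}

// generations maps each (accountID, projectID) pair to the prefix prepended
// to its cache keys; bumping the prefix makes previously cached entries for
// that tenant unreachable, i.e. invalidates them for that tenant only.
var (
	mu          sync.RWMutex
	generations = map[tenantKey]uint64{}
	counter     uint64
)

func invalidate(k tenantKey) uint64 {
	gen := atomic.AddUint64(&counter, 1)
	mu.Lock()
	generations[k] = gen
	mu.Unlock()
	return gen
}

func cacheKey(k tenantKey, tagFilters string) string {
	mu.RLock()
	gen := generations[k]
	mu.RUnlock()
	if gen == 0 {
		// No prefix yet for this tenant - create it lazily,
		// mirroring marshalTagFiltersKey above.
		gen = invalidate(k)
	}
	return fmt.Sprintf("%d/%s", gen, tagFilters)
}

func main() {
	k := tenantKey{AccountID: 42, ProjectID: 7}
	before := cacheKey(k, `{job="node"}`)
	invalidate(k) // e.g. a new time series was registered for this tenant
	after := cacheKey(k, `{job="node"}`)
	fmt.Println(before != after) // true: old cached entries are skipped
}
```

Bumping the generation for one (accountID, projectID) pair leaves the cached entries of every other tenant untouched, which is why `tagCachePrefixes` is keyed by `accountProjectKey`.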
- atomic.AddUint64(&tagFiltersKeyGen, 1) + prefix := atomic.AddUint64(&tagCacheKeyPrefix, 1) + k := accountProjectKey{ + AccountID: accountID, + ProjectID: projectID, + } + db.tagCachePrefixesLock.Lock() + db.tagCachePrefixes[k] = prefix + db.tagCachePrefixesLock.Unlock() + return prefix } -var tagFiltersKeyGen uint64 +var tagCacheKeyPrefix uint64 // getTSIDByNameNoCreate fills the dst with TSID for the given metricName. // @@ -425,8 +467,9 @@ func (db *indexDB) createTSIDByName(dst *TSID, metricName []byte) error { return fmt.Errorf("cannot create indexes: %s", err) } - // Invalidate tag cache, since it doesn't contain tags for the created mn -> TSID mapping. - db.invalidateTagCache() + // Invalidate tag cache for the given (AccountID, ProjectID), since + // it doesn't contain tags for the created mn -> TSID mapping. + _ = db.invalidateTagCache(mn.AccountID, mn.ProjectID) return nil } @@ -449,6 +492,8 @@ func (db *indexDB) generateTSID(dst *TSID, metricName []byte, mn *MetricName) er // The TSID wan't found in the external storage. // Generate it locally. + dst.AccountID = mn.AccountID + dst.ProjectID = mn.ProjectID dst.MetricGroupID = xxhash.Sum64(mn.MetricGroup) if len(mn.Tags) > 0 { dst.JobID = uint32(xxhash.Sum64(mn.Tags[0].Value)) @@ -474,19 +519,19 @@ func (db *indexDB) createIndexes(tsid *TSID, mn *MetricName) error { items.Next() // Create MetricID -> MetricName index. - items.B = marshalCommonPrefix(items.B, nsPrefixMetricIDToMetricName) + items.B = marshalCommonPrefix(items.B, nsPrefixMetricIDToMetricName, mn.AccountID, mn.ProjectID) items.B = encoding.MarshalUint64(items.B, tsid.MetricID) items.B = mn.Marshal(items.B) items.Next() // Create MetricID -> TSID index. - items.B = marshalCommonPrefix(items.B, nsPrefixMetricIDToTSID) + items.B = marshalCommonPrefix(items.B, nsPrefixMetricIDToTSID, mn.AccountID, mn.ProjectID) items.B = encoding.MarshalUint64(items.B, tsid.MetricID) items.B = tsid.Marshal(items.B) items.Next() commonPrefix := kbPool.Get() - commonPrefix.B = marshalCommonPrefix(commonPrefix.B[:0], nsPrefixTagToMetricID) + commonPrefix.B = marshalCommonPrefix(commonPrefix.B[:0], nsPrefixTagToMetricID, mn.AccountID, mn.ProjectID) // Create MetricGroup -> MetricID index. items.B = append(items.B, commonPrefix.B...) @@ -543,14 +588,14 @@ func putIndexItems(ii *indexItems) { var indexItemsPool sync.Pool -// SearchTagKeys returns all the tag keys. -func (db *indexDB) SearchTagKeys(maxTagKeys int) ([]string, error) { +// SearchTagKeys returns all the tag keys for the given accountID, projectID. +func (db *indexDB) SearchTagKeys(accountID, projectID uint32, maxTagKeys int) ([]string, error) { // TODO: cache results? 
tks := make(map[string]struct{}) is := db.getIndexSearch() - err := is.searchTagKeys(tks, maxTagKeys) + err := is.searchTagKeys(accountID, projectID, tks, maxTagKeys) db.putIndexSearch(is) if err != nil { return nil, err @@ -558,7 +603,7 @@ func (db *indexDB) SearchTagKeys(maxTagKeys int) ([]string, error) { ok := db.doExtDB(func(extDB *indexDB) { is := extDB.getIndexSearch() - err = is.searchTagKeys(tks, maxTagKeys) + err = is.searchTagKeys(accountID, projectID, tks, maxTagKeys) extDB.putIndexSearch(is) }) if ok && err != nil { @@ -574,11 +619,11 @@ func (db *indexDB) SearchTagKeys(maxTagKeys int) ([]string, error) { return keys, nil } -func (is *indexSearch) searchTagKeys(tks map[string]struct{}, maxTagKeys int) error { +func (is *indexSearch) searchTagKeys(accountID, projectID uint32, tks map[string]struct{}, maxTagKeys int) error { ts := &is.ts kb := &is.kb dmis := is.db.getDeletedMetricIDs() - commonPrefix := marshalCommonPrefix(nil, nsPrefixTagToMetricID) + commonPrefix := marshalCommonPrefix(nil, nsPrefixTagToMetricID, accountID, projectID) ts.Seek(commonPrefix) for len(tks) < maxTagKeys && ts.NextItem() { item := ts.Item @@ -626,11 +671,11 @@ func (is *indexSearch) searchTagKeys(tks map[string]struct{}, maxTagKeys int) er } // SearchTagValues returns all the tag values for the given tagKey -func (db *indexDB) SearchTagValues(tagKey []byte, maxTagValues int) ([]string, error) { +func (db *indexDB) SearchTagValues(accountID, projectID uint32, tagKey []byte, maxTagValues int) ([]string, error) { // TODO: cache results? kb := kbPool.Get() - kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixTagToMetricID) + kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixTagToMetricID, accountID, projectID) kb.B = marshalTagValue(kb.B, tagKey) tvs := make(map[string]struct{}) @@ -712,13 +757,13 @@ func (is *indexSearch) searchTagValues(tvs map[string]struct{}, prefix []byte, m return nil } -// GetSeriesCount returns the approximate number of unique timeseries in the db. +// GetSeriesCount returns the approximate number of unique timeseries for the given (accountID, projectID). // // It includes the deleted series too and may count the same series // up to two times - in db and extDB. -func (db *indexDB) GetSeriesCount() (uint64, error) { +func (db *indexDB) GetSeriesCount(accountID, projectID uint32) (uint64, error) { is := db.getIndexSearch() - n, err := getSeriesCount(&is.ts, &is.kb) + n, err := getSeriesCount(accountID, projectID, &is.ts, &is.kb) db.putIndexSearch(is) if err != nil { return 0, err @@ -727,7 +772,7 @@ func (db *indexDB) GetSeriesCount() (uint64, error) { var nExt uint64 ok := db.doExtDB(func(extDB *indexDB) { is := extDB.getIndexSearch() - nExt, err = getSeriesCount(&is.ts, &is.kb) + nExt, err = getSeriesCount(accountID, projectID, &is.ts, &is.kb) extDB.putIndexSearch(is) }) if ok && err != nil { @@ -738,9 +783,9 @@ func (db *indexDB) GetSeriesCount() (uint64, error) { // searchMetricName appends metric name for the given metricID to dst // and returns the result. -func (db *indexDB) searchMetricName(dst []byte, metricID uint64) ([]byte, error) { +func (db *indexDB) searchMetricName(dst []byte, metricID uint64, accountID, projectID uint32) ([]byte, error) { is := db.getIndexSearch() - dst, err := is.searchMetricName(dst, metricID) + dst, err := is.searchMetricName(dst, metricID, accountID, projectID) db.putIndexSearch(is) if err != io.EOF { @@ -750,7 +795,7 @@ func (db *indexDB) searchMetricName(dst []byte, metricID uint64) ([]byte, error) // Try searching in the external indexDB. 
if db.doExtDB(func(extDB *indexDB) { is := extDB.getIndexSearch() - dst, err = is.searchMetricName(dst, metricID) + dst, err = is.searchMetricName(dst, metricID, accountID, projectID) extDB.putIndexSearch(is) }) { return dst, err @@ -771,6 +816,8 @@ func (db *indexDB) DeleteTSIDs(tfss []*TagFilters) (int, error) { if len(tfss) == 0 { return 0, nil } + accountID := tfss[0].accountID + projectID := tfss[0].projectID // Obtain metricIDs to delete. is := db.getIndexSearch() @@ -802,7 +849,7 @@ func (db *indexDB) DeleteTSIDs(tfss []*TagFilters) (int, error) { db.updateDeletedMetricIDs(metricIDs) // Reset TagFilters -> TSIDS cache, since it may contain deleted TSIDs. - db.invalidateTagCache() + _ = db.invalidateTagCache(accountID, projectID) // Delete TSIDs in the extDB. if db.doExtDB(func(extDB *indexDB) { @@ -872,7 +919,7 @@ func (db *indexDB) searchTSIDs(tfss []*TagFilters, tr TimeRange, maxMetrics int) tfKeyBuf := tagFiltersKeyBufPool.Get() defer tagFiltersKeyBufPool.Put(tfKeyBuf) - tfKeyBuf.B = marshalTagFiltersKey(tfKeyBuf.B[:0], tfss) + tfKeyBuf.B = db.marshalTagFiltersKey(tfKeyBuf.B[:0], tfss) tsids, ok := db.getFromTagCache(tfKeyBuf.B) if ok { // Fast path - tsids found in the cache. @@ -959,7 +1006,7 @@ func (is *indexSearch) getTSIDByMetricName(dst *TSID, metricName []byte) error { return io.EOF } -func (is *indexSearch) searchMetricName(dst []byte, metricID uint64) ([]byte, error) { +func (is *indexSearch) searchMetricName(dst []byte, metricID uint64, accountID, projectID uint32) ([]byte, error) { metricName := is.db.getMetricNameFromCache(dst, metricID) if len(metricName) > len(dst) { return metricName, nil @@ -967,7 +1014,7 @@ func (is *indexSearch) searchMetricName(dst []byte, metricID uint64) ([]byte, er ts := &is.ts kb := &is.kb - kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixMetricIDToMetricName) + kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixMetricIDToMetricName, accountID, projectID) kb.B = encoding.MarshalUint64(kb.B, metricID) if err := ts.FirstItemWithPrefix(kb.B); err != nil { if err == io.EOF { @@ -1021,6 +1068,8 @@ func (is *indexSearch) searchTSIDs(tfss []*TagFilters, tr TimeRange, maxMetrics // Obtain TSID values for the given metricIDs. tsids := make([]TSID, len(metricIDs)) i := 0 + accountID := tfss[0].accountID + projectID := tfss[0].projectID for _, metricID := range metricIDs { // Try obtaining TSIDs from db.tsidCache. This is much faster // than scanning the mergeset if it contains a lot of metricIDs. @@ -1034,7 +1083,7 @@ func (is *indexSearch) searchTSIDs(tfss []*TagFilters, tr TimeRange, maxMetrics if err != io.EOF { return nil, err } - if err := is.getTSIDByMetricID(tsid, metricID); err != nil { + if err := is.getTSIDByMetricID(&tsids[i], metricID, accountID, projectID); err != nil { if err == io.EOF { // Cannot find TSID for the given metricID. // This may be the case on incomplete indexDB @@ -1054,12 +1103,12 @@ func (is *indexSearch) searchTSIDs(tfss []*TagFilters, tr TimeRange, maxMetrics return tsids, nil } -func (is *indexSearch) getTSIDByMetricID(dst *TSID, metricID uint64) error { +func (is *indexSearch) getTSIDByMetricID(dst *TSID, metricID uint64, accountID, projectID uint32) error { // There is no need in checking for deleted metricIDs here, since they // must be checked by the caller. 
ts := &is.ts kb := &is.kb - kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixMetricIDToTSID) + kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixMetricIDToTSID, accountID, projectID) kb.B = encoding.MarshalUint64(kb.B, metricID) if err := ts.FirstItemWithPrefix(kb.B); err != nil { if err == io.EOF { @@ -1078,9 +1127,9 @@ func (is *indexSearch) getTSIDByMetricID(dst *TSID, metricID uint64) error { return nil } -func getSeriesCount(ts *mergeset.TableSearch, kb *bytesutil.ByteBuffer) (uint64, error) { +func getSeriesCount(accountID, projectID uint32, ts *mergeset.TableSearch, kb *bytesutil.ByteBuffer) (uint64, error) { var n uint64 - kb.B = append(kb.B[:0], nsPrefixMetricIDToTSID) + kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixMetricIDToTSID, accountID, projectID) ts.Seek(kb.B) for ts.NextItem() { if !bytes.HasPrefix(ts.Item, kb.B) { @@ -1097,7 +1146,7 @@ func getSeriesCount(ts *mergeset.TableSearch, kb *bytesutil.ByteBuffer) (uint64, // searchMetricIDsMapByMetricNameMatch matches metricName values for the given srcMetricIDs against tfs // and adds matching metrics to metricIDs. -func (is *indexSearch) searchMetricIDsMapByMetricNameMatch(metricIDs, srcMetricIDs map[uint64]struct{}, tfs []*tagFilter) error { +func (is *indexSearch) searchMetricIDsMapByMetricNameMatch(metricIDs, srcMetricIDs map[uint64]struct{}, tfs []*tagFilter, accountID, projectID uint32) error { // sort srcMetricIDs in order to speed up Seek below. sortedMetricIDs := make([]uint64, 0, len(srcMetricIDs)) for metricID := range srcMetricIDs { @@ -1111,7 +1160,7 @@ func (is *indexSearch) searchMetricIDsMapByMetricNameMatch(metricIDs, srcMetricI defer PutMetricName(mn) for _, metricID := range sortedMetricIDs { var err error - metricName.B, err = is.searchMetricName(metricName.B[:0], metricID) + metricName.B, err = is.searchMetricName(metricName.B[:0], metricID, accountID, projectID) if err != nil { return fmt.Errorf("cannot find metricName by metricID %d: %s", metricID, err) } @@ -1174,8 +1223,7 @@ func (is *indexSearch) getTagFilterWithMinMetricIDsMap(tfs *TagFilters, maxMetri } func matchTagFilters(mn *MetricName, tfs []*tagFilter, kb *bytesutil.ByteBuffer) (bool, error) { - kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixTagToMetricID) - + kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixTagToMetricID, mn.AccountID, mn.ProjectID) for _, tf := range tfs { if len(tf.key) == 0 { // Match against mn.MetricGroup. @@ -1322,7 +1370,7 @@ func (is *indexSearch) searchMetricIDsMap(metricIDs map[uint64]struct{}, tfs *Ta // Allow fetching up to 20*maxMetrics metrics for the given time range // in the hope these metricIDs will be filtered out by other filters below. maxTimeRangeMetrics := 20 * maxMetrics - metricIDsForTimeRange, err := is.getMetricIDsForTimeRange(tr, maxTimeRangeMetrics+1) + metricIDsForTimeRange, err := is.getMetricIDsForTimeRange(tr, maxTimeRangeMetrics+1, tfs.accountID, tfs.projectID) if err == errMissingMetricIDsForDate { // Give up. 
for metricID := range minMetricIDs { @@ -1364,7 +1412,7 @@ func (is *indexSearch) searchMetricIDsMap(metricIDs map[uint64]struct{}, tfs *Ta for i, tf := range tfsPostponed { mIDs, err := is.intersectMetricIDsMapForTagFilter(tf, minMetricIDs) if err == errFallbackToMetricNameMatch { - return is.searchMetricIDsMapByMetricNameMatch(metricIDs, minMetricIDs, tfsPostponed[i:]) + return is.searchMetricIDsMapByMetricNameMatch(metricIDs, minMetricIDs, tfsPostponed[i:], tfs.accountID, tfs.projectID) } if err != nil { return err @@ -1537,7 +1585,7 @@ var errFallbackToMetricNameMatch = errors.New("fall back to searchMetricIDsMapBy var errMissingMetricIDsForDate = errors.New("missing metricIDs for date") -func (is *indexSearch) getMetricIDsForTimeRange(tr TimeRange, maxMetrics int) (map[uint64]struct{}, error) { +func (is *indexSearch) getMetricIDsForTimeRange(tr TimeRange, maxMetrics int, accountID, projectID uint32) (map[uint64]struct{}, error) { if tr.isZero() { return nil, errMissingMetricIDsForDate } @@ -1549,7 +1597,7 @@ func (is *indexSearch) getMetricIDsForTimeRange(tr TimeRange, maxMetrics int) (m } metricIDs := make(map[uint64]struct{}, maxMetrics) for minDate <= maxDate { - if err := is.getMetricIDsForDate(uint64(minDate), metricIDs, maxMetrics); err != nil { + if err := is.getMetricIDsForDate(uint64(minDate), metricIDs, maxMetrics, accountID, projectID); err != nil { return nil, err } minDate++ @@ -1557,9 +1605,9 @@ func (is *indexSearch) getMetricIDsForTimeRange(tr TimeRange, maxMetrics int) (m return metricIDs, nil } -func (db *indexDB) storeDateMetricID(date, metricID uint64) error { +func (db *indexDB) storeDateMetricID(date, metricID uint64, accountID, projectID uint32) error { is := db.getIndexSearch() - ok, err := is.hasDateMetricID(date, metricID) + ok, err := is.hasDateMetricID(date, metricID, accountID, projectID) db.putIndexSearch(is) if err != nil { return err @@ -1571,7 +1619,7 @@ func (db *indexDB) storeDateMetricID(date, metricID uint64) error { // Slow path: create (date, metricID) entry. 
items := getIndexItems() - items.B = marshalCommonPrefix(items.B[:0], nsPrefixDateToMetricID) + items.B = marshalCommonPrefix(items.B[:0], nsPrefixDateToMetricID, accountID, projectID) items.B = encoding.MarshalUint64(items.B, date) items.B = encoding.MarshalUint64(items.B, metricID) items.Next() @@ -1580,10 +1628,10 @@ func (db *indexDB) storeDateMetricID(date, metricID uint64) error { return err } -func (is *indexSearch) hasDateMetricID(date, metricID uint64) (bool, error) { +func (is *indexSearch) hasDateMetricID(date, metricID uint64, accountID, projectID uint32) (bool, error) { ts := &is.ts kb := &is.kb - kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixDateToMetricID) + kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixDateToMetricID, accountID, projectID) kb.B = encoding.MarshalUint64(kb.B, date) kb.B = encoding.MarshalUint64(kb.B, metricID) if err := ts.FirstItemWithPrefix(kb.B); err != nil { @@ -1598,10 +1646,10 @@ func (is *indexSearch) hasDateMetricID(date, metricID uint64) (bool, error) { return true, nil } -func (is *indexSearch) getMetricIDsForDate(date uint64, metricIDs map[uint64]struct{}, maxMetrics int) error { +func (is *indexSearch) getMetricIDsForDate(date uint64, metricIDs map[uint64]struct{}, maxMetrics int, accountID, projectID uint32) error { ts := &is.ts kb := &is.kb - kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixDateToMetricID) + kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixDateToMetricID, accountID, projectID) kb.B = encoding.MarshalUint64(kb.B, date) ts.Seek(kb.B) items := 0 @@ -1733,8 +1781,10 @@ func getUniqueUint64() uint64 { // between VictoriaMetrics restarts. var uniqueUint64 = uint64(time.Now().UnixNano()) -func marshalCommonPrefix(dst []byte, nsPrefix byte) []byte { +func marshalCommonPrefix(dst []byte, nsPrefix byte, accountID, projectID uint32) []byte { dst = append(dst, nsPrefix) + dst = encoding.MarshalUint32(dst, accountID) + dst = encoding.MarshalUint32(dst, projectID) return dst } diff --git a/lib/storage/index_db_test.go b/lib/storage/index_db_test.go index 071859119..ba694b717 100644 --- a/lib/storage/index_db_test.go +++ b/lib/storage/index_db_test.go @@ -263,6 +263,8 @@ func testIndexDBGetOrCreateTSIDByName(db *indexDB, accountsCount, projectsCount, for i := 0; i < 4e2+1; i++ { var mn MetricName + mn.AccountID = uint32((i + 2) % accountsCount) + mn.ProjectID = uint32((i + 1) % projectsCount) // Init MetricGroup. 
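With `marshalCommonPrefix` now taking the tenant, every index namespace key starts with one namespace byte followed by AccountID and ProjectID, and the `(date, metricID)` entries written by `storeDateMetricID` live under that 9-byte tenant-scoped prefix. A rough sketch of the resulting key layout, assuming the big-endian integer encoding used by `lib/encoding` (the namespace constant value below is made up for illustration):

```go
package main

import (
	"encoding/binary"
	"fmt"
)

// marshalCommonPrefix mirrors the layout from the diff:
// 1 namespace byte, then AccountID and ProjectID as 4-byte integers.
func marshalCommonPrefix(dst []byte, nsPrefix byte, accountID, projectID uint32) []byte {
	dst = append(dst, nsPrefix)
	dst = binary.BigEndian.AppendUint32(dst, accountID)
	dst = binary.BigEndian.AppendUint32(dst, projectID)
	return dst
}

func main() {
	const nsPrefixDateToMetricID = 5 // illustrative value, not the real constant

	// Key for a (date, metricID) entry: tenant-scoped prefix + date + metricID.
	key := marshalCommonPrefix(nil, nsPrefixDateToMetricID, 123, 456)
	key = binary.BigEndian.AppendUint64(key, 18100) // date bucket
	key = binary.BigEndian.AppendUint64(key, 987654321)

	fmt.Println(len(key)) // 25 = 1 (namespace) + 4 + 4 (tenant) + 8 (date) + 8 (metricID)
}
```

Because keys sharing a prefix sort together in the mergeset, `ts.Seek(prefix)` naturally restricts scans such as `getMetricIDsForDate` and `searchTagKeys` to a single tenant's slice of the index.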
mn.MetricGroup = []byte(fmt.Sprintf("metricGroup_%d\x00\x01\x02", i%metricGroups)) @@ -282,6 +284,12 @@ func testIndexDBGetOrCreateTSIDByName(db *indexDB, accountsCount, projectsCount, if err := is.GetOrCreateTSIDByName(&tsid, metricName); err != nil { return nil, nil, fmt.Errorf("unexpected error when creating tsid for mn:\n%s: %s", &mn, err) } + if tsid.AccountID != mn.AccountID { + return nil, nil, fmt.Errorf("unexpected TSID.AccountID; got %d; want %d; mn:\n%s\ntsid:\n%+v", tsid.AccountID, mn.AccountID, &mn, &tsid) + } + if tsid.ProjectID != mn.ProjectID { + return nil, nil, fmt.Errorf("unexpected TSID.ProjectID; got %d; want %d; mn:\n%s\ntsid:\n%+v", tsid.ProjectID, mn.ProjectID, &mn, &tsid) + } mns = append(mns, mn) tsids = append(tsids, tsid) @@ -302,15 +310,23 @@ func testIndexDBCheckTSIDByName(db *indexDB, mns []MetricName, tsids []TSID, isC return false } - timeseriesCounters := make(map[uint64]bool) + allKeys := make(map[accountProjectKey]map[string]bool) + timeseriesCounters := make(map[accountProjectKey]map[uint64]bool) var tsidCopy TSID var metricNameCopy []byte - allKeys := make(map[string]bool) for i := range mns { mn := &mns[i] tsid := &tsids[i] - tc := timeseriesCounters + apKey := accountProjectKey{ + AccountID: tsid.AccountID, + ProjectID: tsid.ProjectID, + } + tc := timeseriesCounters[apKey] + if tc == nil { + tc = make(map[uint64]bool) + timeseriesCounters[apKey] = tc + } tc[tsid.MetricID] = true mn.sortTags() @@ -330,7 +346,7 @@ func testIndexDBCheckTSIDByName(db *indexDB, mns []MetricName, tsids []TSID, isC // Search for metric name for the given metricID. var err error - metricNameCopy, err = db.searchMetricName(metricNameCopy[:0], tsidCopy.MetricID) + metricNameCopy, err = db.searchMetricName(metricNameCopy[:0], tsidCopy.MetricID, tsidCopy.AccountID, tsidCopy.ProjectID) if err != nil { return fmt.Errorf("error in searchMetricName: %s", err) } @@ -339,7 +355,7 @@ func testIndexDBCheckTSIDByName(db *indexDB, mns []MetricName, tsids []TSID, isC } // Try searching metric name for non-existent MetricID. 
- buf, err := db.searchMetricName(nil, 1) + buf, err := db.searchMetricName(nil, 1, mn.AccountID, mn.ProjectID) if err != io.EOF { return fmt.Errorf("expecting io.EOF error when searching for non-existing metricID; got %v", err) } @@ -348,37 +364,44 @@ func testIndexDBCheckTSIDByName(db *indexDB, mns []MetricName, tsids []TSID, isC } // Test SearchTagValues - tvs, err := db.SearchTagValues(nil, 1e5) + tvs, err := db.SearchTagValues(mn.AccountID, mn.ProjectID, nil, 1e5) if err != nil { return fmt.Errorf("error in SearchTagValues for __name__: %s", err) } if !hasValue(tvs, mn.MetricGroup) { return fmt.Errorf("SearchTagValues couldn't find %q; found %q", mn.MetricGroup, tvs) } + apKeys := allKeys[apKey] + if apKeys == nil { + apKeys = make(map[string]bool) + allKeys[apKey] = apKeys + } for i := range mn.Tags { tag := &mn.Tags[i] - tvs, err := db.SearchTagValues(tag.Key, 1e5) + tvs, err := db.SearchTagValues(mn.AccountID, mn.ProjectID, tag.Key, 1e5) if err != nil { return fmt.Errorf("error in SearchTagValues for __name__: %s", err) } if !hasValue(tvs, tag.Value) { return fmt.Errorf("SearchTagValues couldn't find %q=%q; found %q", tag.Key, tag.Value, tvs) } - allKeys[string(tag.Key)] = true + apKeys[string(tag.Key)] = true } } // Test SearchTagKeys - tks, err := db.SearchTagKeys(1e5) - if err != nil { - return fmt.Errorf("error in SearchTagKeys: %s", err) - } - if !hasValue(tks, nil) { - return fmt.Errorf("cannot find __name__ in %q", tks) - } - for key := range allKeys { - if !hasValue(tks, []byte(key)) { - return fmt.Errorf("cannot find %q in %q", key, tks) + for k, apKeys := range allKeys { + tks, err := db.SearchTagKeys(k.AccountID, k.ProjectID, 1e5) + if err != nil { + return fmt.Errorf("error in SearchTagKeys: %s", err) + } + if !hasValue(tks, nil) { + return fmt.Errorf("cannot find __name__ in %q", tks) + } + for key := range apKeys { + if !hasValue(tks, []byte(key)) { + return fmt.Errorf("cannot find %q in %q", key, tks) + } } } @@ -388,7 +411,7 @@ func testIndexDBCheckTSIDByName(db *indexDB, mns []MetricName, tsids []TSID, isC tsid := &tsids[i] // Search without regexps. - tfs := NewTagFilters() + tfs := NewTagFilters(mn.AccountID, mn.ProjectID) if err := tfs.Add(nil, mn.MetricGroup, false, false); err != nil { return fmt.Errorf("cannot create tag filter for MetricGroup: %s", err) } @@ -434,7 +457,7 @@ func testIndexDBCheckTSIDByName(db *indexDB, mns []MetricName, tsids []TSID, isC } // Search with regexps. - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add(nil, mn.MetricGroup, false, true); err != nil { return fmt.Errorf("cannot create regexp tag filter for MetricGroup: %s", err) } @@ -472,7 +495,7 @@ func testIndexDBCheckTSIDByName(db *indexDB, mns []MetricName, tsids []TSID, isC } // Search with filter matching zero results. - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add([]byte("non-existing-key"), []byte("foobar"), false, false); err != nil { return fmt.Errorf("cannot add non-existing key: %s", err) } @@ -493,8 +516,8 @@ func testIndexDBCheckTSIDByName(db *indexDB, mns []MetricName, tsids []TSID, isC continue } - // Search with empty filter. It should match all the results. - tfs.Reset() + // Search with empty filter. It should match all the results for (accountID, projectID). 
+ tfs.Reset(mn.AccountID, mn.ProjectID) tsidsFound, err = db.searchTSIDs([]*TagFilters{tfs}, TimeRange{}, 1e5) if err != nil { return fmt.Errorf("cannot search for common prefix: %s", err) @@ -504,7 +527,7 @@ func testIndexDBCheckTSIDByName(db *indexDB, mns []MetricName, tsids []TSID, isC } // Search with empty metricGroup. It should match zero results. - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add(nil, nil, false, false); err != nil { return fmt.Errorf("cannot create tag filter for empty metricGroup: %s", err) } @@ -517,11 +540,11 @@ func testIndexDBCheckTSIDByName(db *indexDB, mns []MetricName, tsids []TSID, isC } // Search with multiple tfss - tfs1 := NewTagFilters() + tfs1 := NewTagFilters(mn.AccountID, mn.ProjectID) if err := tfs1.Add(nil, nil, false, false); err != nil { return fmt.Errorf("cannot create tag filter for empty metricGroup: %s", err) } - tfs2 := NewTagFilters() + tfs2 := NewTagFilters(mn.AccountID, mn.ProjectID) if err := tfs2.Add(nil, mn.MetricGroup, false, false); err != nil { return fmt.Errorf("cannot create tag filter for MetricGroup: %s", err) } @@ -539,7 +562,7 @@ func testIndexDBCheckTSIDByName(db *indexDB, mns []MetricName, tsids []TSID, isC return fmt.Errorf("cannot search for nil tfss: %s", err) } if len(tsidsFound) != 0 { - return fmt.Errorf("unexpected non-empty tsids fround for nil tfss; found %d tsids", len(tsidsFound)) + return fmt.Errorf("unexpected non-empty tsids fround for nil tfss") } } @@ -557,6 +580,8 @@ func testHasTSID(tsids []TSID, tsid *TSID) bool { func TestMatchTagFilters(t *testing.T) { var mn MetricName + mn.AccountID = 123 + mn.ProjectID = 456 mn.MetricGroup = append(mn.MetricGroup, "foobar_metric"...) for i := 0; i < 5; i++ { key := fmt.Sprintf("key %d", i) @@ -565,8 +590,8 @@ func TestMatchTagFilters(t *testing.T) { } var bb bytesutil.ByteBuffer - var tfs TagFilters - tfs.Reset() + // Verify tag filters for different account / project + tfs := NewTagFilters(mn.AccountID, mn.ProjectID+1) if err := tfs.Add(nil, []byte("foobar_metric"), false, false); err != nil { t.Fatalf("cannot add filter: %s", err) } @@ -574,12 +599,36 @@ func TestMatchTagFilters(t *testing.T) { if err != nil { t.Fatalf("unexpected error: %s", err) } + if ok { + t.Fatalf("Tag filters shouldn't match for invalid projectID") + } + tfs.Reset(mn.AccountID+1, mn.ProjectID) + if err := tfs.Add(nil, []byte("foobar_metric"), false, false); err != nil { + t.Fatalf("cannot add filter: %s", err) + } + ok, err = matchTagFilters(&mn, toTFPointers(tfs.tfs), &bb) + if err != nil { + t.Fatalf("unexpected error: %s", err) + } + if ok { + t.Fatalf("Tag filters shouldn't match for invalid accountID") + } + + // Correct AccountID , ProjectID + tfs.Reset(mn.AccountID, mn.ProjectID) + if err := tfs.Add(nil, []byte("foobar_metric"), false, false); err != nil { + t.Fatalf("cannot add filter: %s", err) + } + ok, err = matchTagFilters(&mn, toTFPointers(tfs.tfs), &bb) + if err != nil { + t.Fatalf("unexpected error: %s", err) + } if !ok { t.Fatalf("should match") } // Empty tag filters should match. 
- tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) ok, err = matchTagFilters(&mn, toTFPointers(tfs.tfs), &bb) if err != nil { t.Fatalf("unexpected error: %s", err) @@ -589,7 +638,7 @@ func TestMatchTagFilters(t *testing.T) { } // Negative match by MetricGroup - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add(nil, []byte("foobar"), false, false); err != nil { t.Fatalf("cannot add no regexp, no negative filter: %s", err) } @@ -600,7 +649,7 @@ func TestMatchTagFilters(t *testing.T) { if ok { t.Fatalf("Shouldn't match") } - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add(nil, []byte("obar.+"), false, true); err != nil { t.Fatalf("cannot add regexp, no negative filter: %s", err) } @@ -611,7 +660,7 @@ func TestMatchTagFilters(t *testing.T) { if ok { t.Fatalf("Shouldn't match") } - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add(nil, []byte("foobar_metric"), true, false); err != nil { t.Fatalf("cannot add no regexp, negative filter: %s", err) } @@ -622,7 +671,7 @@ func TestMatchTagFilters(t *testing.T) { if ok { t.Fatalf("Shouldn't match") } - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add(nil, []byte("foob.+metric"), true, true); err != nil { t.Fatalf("cannot add regexp, negative filter: %s", err) } @@ -635,7 +684,7 @@ func TestMatchTagFilters(t *testing.T) { } // Positive match by MetricGroup - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add(nil, []byte("foobar_metric"), false, false); err != nil { t.Fatalf("cannot add no regexp, no negative filter: %s", err) } @@ -646,7 +695,7 @@ func TestMatchTagFilters(t *testing.T) { if !ok { t.Fatalf("Should match") } - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add(nil, []byte("foobar.+etric"), false, true); err != nil { t.Fatalf("cannot add regexp, no negative filter: %s", err) } @@ -657,7 +706,7 @@ func TestMatchTagFilters(t *testing.T) { if !ok { t.Fatalf("Should match") } - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add(nil, []byte("obar_metric"), true, false); err != nil { t.Fatalf("cannot add no regexp, negative filter: %s", err) } @@ -668,7 +717,7 @@ func TestMatchTagFilters(t *testing.T) { if !ok { t.Fatalf("Should match") } - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add(nil, []byte("ob.+metric"), true, true); err != nil { t.Fatalf("cannot add regexp, negative filter: %s", err) } @@ -681,7 +730,7 @@ func TestMatchTagFilters(t *testing.T) { } // Negative match by non-existing tag - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add([]byte("non-existing-tag"), []byte("foobar"), false, false); err != nil { t.Fatalf("cannot add no regexp, no negative filter: %s", err) } @@ -692,7 +741,7 @@ func TestMatchTagFilters(t *testing.T) { if ok { t.Fatalf("Shouldn't match") } - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add([]byte("non-existing-tag"), []byte("obar.+"), false, true); err != nil { t.Fatalf("cannot add regexp, no negative filter: %s", err) } @@ -703,7 +752,7 @@ func TestMatchTagFilters(t *testing.T) { if ok { t.Fatalf("Shouldn't match") } - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add([]byte("non-existing-tag"), []byte("foobar_metric"), true, false); err != nil { t.Fatalf("cannot add no regexp, negative filter: %s", err) } @@ -714,7 +763,7 @@ func TestMatchTagFilters(t *testing.T) { if ok { t.Fatalf("Shouldn't match") } - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := 
tfs.Add([]byte("non-existing-tag"), []byte("foob.+metric"), true, true); err != nil { t.Fatalf("cannot add regexp, negative filter: %s", err) } @@ -727,7 +776,7 @@ func TestMatchTagFilters(t *testing.T) { } // Negative match by existing tag - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add([]byte("key 0"), []byte("foobar"), false, false); err != nil { t.Fatalf("cannot add no regexp, no negative filter: %s", err) } @@ -738,7 +787,7 @@ func TestMatchTagFilters(t *testing.T) { if ok { t.Fatalf("Shouldn't match") } - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add([]byte("key 1"), []byte("obar.+"), false, true); err != nil { t.Fatalf("cannot add regexp, no negative filter: %s", err) } @@ -749,7 +798,7 @@ func TestMatchTagFilters(t *testing.T) { if ok { t.Fatalf("Shouldn't match") } - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add([]byte("key 2"), []byte("value 2"), true, false); err != nil { t.Fatalf("cannot add no regexp, negative filter: %s", err) } @@ -760,7 +809,7 @@ func TestMatchTagFilters(t *testing.T) { if ok { t.Fatalf("Shouldn't match") } - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add([]byte("key 3"), []byte("v.+lue 3"), true, true); err != nil { t.Fatalf("cannot add regexp, negative filter: %s", err) } @@ -773,7 +822,7 @@ func TestMatchTagFilters(t *testing.T) { } // Positive match by existing tag - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add([]byte("key 0"), []byte("value 0"), false, false); err != nil { t.Fatalf("cannot add no regexp, no negative filter: %s", err) } @@ -784,7 +833,7 @@ func TestMatchTagFilters(t *testing.T) { if !ok { t.Fatalf("Should match") } - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add([]byte("key 1"), []byte(".+lue 1"), false, true); err != nil { t.Fatalf("cannot add regexp, no negative filter: %s", err) } @@ -795,7 +844,7 @@ func TestMatchTagFilters(t *testing.T) { if !ok { t.Fatalf("Should match") } - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add([]byte("key 2"), []byte("value 3"), true, false); err != nil { t.Fatalf("cannot add no regexp, negative filter: %s", err) } @@ -806,7 +855,7 @@ func TestMatchTagFilters(t *testing.T) { if !ok { t.Fatalf("Should match") } - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add([]byte("key 3"), []byte("v.+lue 2"), true, true); err != nil { t.Fatalf("cannot add regexp, negative filter: %s", err) } @@ -819,7 +868,7 @@ func TestMatchTagFilters(t *testing.T) { } // Positive match by multiple tags and MetricGroup - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) if err := tfs.Add([]byte("key 0"), []byte("value 0"), false, false); err != nil { t.Fatalf("cannot add no regexp, no negative filter: %s", err) } @@ -853,7 +902,7 @@ func TestMatchTagFilters(t *testing.T) { } // Negative match by multiple tags and MetricGroup - tfs.Reset() + tfs.Reset(mn.AccountID, mn.ProjectID) // Positive matches if err := tfs.Add([]byte("key 0"), []byte("value 0"), false, false); err != nil { t.Fatalf("cannot add no regexp, no negative filter: %s", err) diff --git a/lib/storage/index_db_timing_test.go b/lib/storage/index_db_timing_test.go index e830a3777..dbf312e1a 100644 --- a/lib/storage/index_db_timing_test.go +++ b/lib/storage/index_db_timing_test.go @@ -4,6 +4,7 @@ import ( "fmt" "os" "strconv" + "sync/atomic" "testing" "github.com/VictoriaMetrics/fastcache" @@ -28,12 +29,15 @@ func BenchmarkIndexDBAddTSIDs(b *testing.B) { } }() + var goroutineID uint32 + 
b.ReportAllocs() b.SetBytes(recordsPerLoop) b.ResetTimer() b.RunParallel(func(pb *testing.PB) { var mn MetricName var tsid TSID + mn.AccountID = atomic.AddUint32(&goroutineID, 1) // The most common tags. mn.Tags = []Tag{ @@ -105,6 +109,8 @@ func BenchmarkIndexDBSearchTSIDs(b *testing.B) { is := db.getIndexSearch() defer db.putIndexSearch(is) for i := 0; i < recordsCount; i++ { + mn.AccountID = uint32(i % accountsCount) + mn.ProjectID = uint32(i % projectsCount) mn.sortTags() metricName = mn.Marshal(metricName[:0]) if err := is.GetOrCreateTSIDByName(&tsid, metricName); err != nil { @@ -124,7 +130,9 @@ func BenchmarkIndexDBSearchTSIDs(b *testing.B) { tfss := []*TagFilters{&tfs} i := 0 for pb.Next() { - tfs.Reset() + accountID := uint32(i % accountsCount) + projectID := uint32(i % projectsCount) + tfs.Reset(accountID, projectID) for j := range tags { if err := tfs.Add(tags[j].Key, tags[j].Value, false, false); err != nil { panic(fmt.Errorf("BUG: unexpected error: %s", err)) @@ -178,6 +186,8 @@ func BenchmarkIndexDBGetTSIDs(b *testing.B) { is := db.getIndexSearch() defer db.putIndexSearch(is) for i := 0; i < recordsCount; i++ { + mn.AccountID = uint32(i % accountsCount) + mn.ProjectID = uint32(i % projectsCount) mn.sortTags() metricName = mn.Marshal(metricName[:0]) if err := is.GetOrCreateTSIDByName(&tsid, metricName); err != nil { @@ -196,6 +206,8 @@ func BenchmarkIndexDBGetTSIDs(b *testing.B) { defer db.putIndexSearch(is) for pb.Next() { for i := 0; i < recordsPerLoop; i++ { + mnLocal.AccountID = uint32(i % accountsCount) + mnLocal.ProjectID = uint32(i % projectsCount) mnLocal.sortTags() metricNameLocal = mnLocal.Marshal(metricNameLocal[:0]) if err := is.GetOrCreateTSIDByName(&tsidLocal, metricNameLocal); err != nil { diff --git a/lib/storage/metaindex_row_test.go b/lib/storage/metaindex_row_test.go index f55e755de..2f62f513d 100644 --- a/lib/storage/metaindex_row_test.go +++ b/lib/storage/metaindex_row_test.go @@ -11,6 +11,7 @@ func TestMetaindexRowReset(t *testing.T) { var mr metaindexRow mr.TSID.MetricID = 234 + mr.TSID.AccountID = 342 mr.BlockHeadersCount = 1323 mr.MinTimestamp = -234 mr.MaxTimestamp = 8989 diff --git a/lib/storage/metric_name.go b/lib/storage/metric_name.go index 15dcaf265..2d727f05b 100644 --- a/lib/storage/metric_name.go +++ b/lib/storage/metric_name.go @@ -113,6 +113,9 @@ func unmarshalTagValue(dst, src []byte) ([]byte, []byte, error) { // MetricName reperesents a metric name. type MetricName struct { + AccountID uint32 + ProjectID uint32 + MetricGroup []byte // Tags are optional. They must be sorted by tag Key for canonical view. @@ -139,12 +142,16 @@ var mnPool sync.Pool // Reset resets the mn. func (mn *MetricName) Reset() { + mn.AccountID = 0 + mn.ProjectID = 0 mn.MetricGroup = mn.MetricGroup[:0] mn.Tags = mn.Tags[:0] } // CopyFrom copies src to mn. func (mn *MetricName) CopyFrom(src *MetricName) { + mn.AccountID = src.AccountID + mn.ProjectID = src.ProjectID if cap(mn.MetricGroup) > 0 { mn.MetricGroup = append(mn.MetricGroup[:0], src.MetricGroup...) mn.Tags = copyTags(mn.Tags[:0], src.Tags) @@ -316,7 +323,7 @@ func (mn *MetricName) String() string { tags = append(tags, fmt.Sprintf("%q=%q", t.Key, t.Value)) } tagsStr := strings.Join(tags, ", ") - return fmt.Sprintf("MetricGroup=%q, tags=[%s]", mn.MetricGroup, tagsStr) + return fmt.Sprintf("AccountID=%d, ProjectID=%d, MetricGroup=%q, tags=[%s]", mn.AccountID, mn.ProjectID, mn.MetricGroup, tagsStr) } // Marshal appends marshaled mn to dst and returns the result. 
@@ -325,7 +332,7 @@ func (mn *MetricName) String() string { func (mn *MetricName) Marshal(dst []byte) []byte { // Calculate the required size and pre-allocate space in dst dstLen := len(dst) - requiredSize := len(mn.MetricGroup) + 1 + requiredSize := 8 + len(mn.MetricGroup) + 1 for i := range mn.Tags { tag := &mn.Tags[i] requiredSize += len(tag.Key) + len(tag.Value) + 2 @@ -333,16 +340,22 @@ func (mn *MetricName) Marshal(dst []byte) []byte { dst = bytesutil.Resize(dst, requiredSize) dst = dst[:dstLen] - // Marshal MetricGroup + dst = encoding.MarshalUint32(dst, mn.AccountID) + dst = encoding.MarshalUint32(dst, mn.ProjectID) dst = marshalTagValue(dst, mn.MetricGroup) - - // Marshal tags. dst = marshalTags(dst, mn.Tags) return dst } // Unmarshal unmarshals mn from src. func (mn *MetricName) Unmarshal(src []byte) error { + if len(src) < 8 { + return fmt.Errorf("too short src: %d bytes; must be at least % bytes", len(src), 8) + } + mn.AccountID = encoding.UnmarshalUint32(src) + mn.ProjectID = encoding.UnmarshalUint32(src[4:]) + src = src[8:] + // Unmarshal MetricGroup. var err error src, mn.MetricGroup, err = unmarshalTagValue(mn.MetricGroup[:0], src) @@ -393,10 +406,10 @@ const maxLabelsPerTimeseries = 30 // MarshalMetricNameRaw marshals labels to dst and returns the result. // // The result must be unmarshaled with MetricName.unmarshalRaw -func MarshalMetricNameRaw(dst []byte, labels []prompb.Label) []byte { +func MarshalMetricNameRaw(dst []byte, accountID, projectID uint32, labels []prompb.Label) []byte { // Calculate the required space for dst. dstLen := len(dst) - dstSize := dstLen + dstSize := dstLen + 8 for i := range labels { if i >= maxLabelsPerTimeseries { break @@ -422,6 +435,8 @@ func MarshalMetricNameRaw(dst []byte, labels []prompb.Label) []byte { dst = bytesutil.Resize(dst, dstSize)[:dstLen] // Marshal labels to dst. + dst = encoding.MarshalUint32(dst, accountID) + dst = encoding.MarshalUint32(dst, projectID) for i := range labels { if i >= maxLabelsPerTimeseries { break @@ -437,6 +452,13 @@ func MarshalMetricNameRaw(dst []byte, labels []prompb.Label) []byte { return dst } +// MarshalMetricLabelRaw marshals label to dst. +func MarshalMetricLabelRaw(dst []byte, label *prompb.Label) []byte { + dst = marshalBytesFast(dst, label.Name) + dst = marshalBytesFast(dst, label.Value) + return dst +} + // marshalRaw marshals mn to dst and returns the result. // // The results may be unmarshaled with MetricName.unmarshalRaw. @@ -444,6 +466,8 @@ func MarshalMetricNameRaw(dst []byte, labels []prompb.Label) []byte { // This function is for testing purposes. MarshalMetricNameRaw must be used // in prod instead. func (mn *MetricName) marshalRaw(dst []byte) []byte { + dst = encoding.MarshalUint32(dst, mn.AccountID) + dst = encoding.MarshalUint32(dst, mn.ProjectID) dst = marshalBytesFast(dst, nil) dst = marshalBytesFast(dst, mn.MetricGroup) @@ -459,6 +483,16 @@ func (mn *MetricName) marshalRaw(dst []byte) []byte { // unmarshalRaw unmarshals mn encoded with MarshalMetricNameRaw. 
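`MetricName.Marshal` now prepends an 8-byte tenant header (AccountID then ProjectID) ahead of the metric group and tags, and `Unmarshal` rejects anything shorter than that header. A small round-trip sketch of just the header handling; the payload here is a plain string stand-in, not the repo's tag encoding:

```go
package main

import (
	"encoding/binary"
	"fmt"
)

// marshalHeader prepends AccountID and ProjectID, as MetricName.Marshal does
// in the diff before appending the metric group and tags.
func marshalHeader(dst []byte, accountID, projectID uint32) []byte {
	dst = binary.BigEndian.AppendUint32(dst, accountID)
	dst = binary.BigEndian.AppendUint32(dst, projectID)
	return dst
}

// unmarshalHeader reads the tenant header back and returns the remaining payload.
func unmarshalHeader(src []byte) (accountID, projectID uint32, tail []byte, err error) {
	if len(src) < 8 {
		return 0, 0, nil, fmt.Errorf("too short src: %d bytes; must be at least 8 bytes", len(src))
	}
	accountID = binary.BigEndian.Uint32(src)
	projectID = binary.BigEndian.Uint32(src[4:])
	return accountID, projectID, src[8:], nil
}

func main() {
	buf := marshalHeader(nil, 123, 456)
	buf = append(buf, "http_requests_total"...) // stand-in for the marshaled MetricGroup + tags

	accountID, projectID, tail, err := unmarshalHeader(buf)
	if err != nil {
		panic(err)
	}
	fmt.Println(accountID, projectID, string(tail)) // 123 456 http_requests_total
}
```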
func (mn *MetricName) unmarshalRaw(src []byte) error { mn.Reset() + if len(src) < 4 { + return fmt.Errorf("not enough data for decoding accountID; got %d bytes; %X; want at least 4 bytes", len(src), src) + } + mn.AccountID = encoding.UnmarshalUint32(src) + src = src[4:] + if len(src) < 4 { + return fmt.Errorf("not enough data for decoding projectID; got %d bytes; %X; want at least 4 bytes", len(src), src) + } + mn.ProjectID = encoding.UnmarshalUint32(src) + src = src[4:] for len(src) > 0 { tail, key, err := unmarshalBytesFast(src) if err != nil { diff --git a/lib/storage/metric_name_test.go b/lib/storage/metric_name_test.go index 65f2100f0..c518ee984 100644 --- a/lib/storage/metric_name_test.go +++ b/lib/storage/metric_name_test.go @@ -38,6 +38,8 @@ func TestMetricNameMarshalUnmarshal(t *testing.T) { for i := 0; i < 10; i++ { for tagsCount := 0; tagsCount < 10; tagsCount++ { var mn MetricName + mn.AccountID = uint32(i) + mn.ProjectID = uint32(i + 1) for j := 0; j < tagsCount; j++ { key := fmt.Sprintf("key_%d_%d_\x00\x01\x02", i, j) value := fmt.Sprintf("\x02\x00\x01value_%d_%d", i, j) @@ -80,6 +82,8 @@ func TestMetricNameMarshalUnmarshalRaw(t *testing.T) { for i := 0; i < 10; i++ { for tagsCount := 0; tagsCount < 10; tagsCount++ { var mn MetricName + mn.AccountID = uint32(i) + mn.ProjectID = uint32(tagsCount) for j := 0; j < tagsCount; j++ { key := fmt.Sprintf("key_%d_%d_\x00\x01\x02", i, j) value := fmt.Sprintf("\x02\x00\x01value_%d_%d", i, j) diff --git a/lib/storage/raw_row.go b/lib/storage/raw_row.go index 3d58c8e5e..7b506af08 100644 --- a/lib/storage/raw_row.go +++ b/lib/storage/raw_row.go @@ -59,6 +59,18 @@ func (rrs *rawRowsSort) Less(i, j int) bool { // Slow path - compare TSIDs. // Manually inline TSID.Less here, since the compiler doesn't inline it yet :( + if ta.AccountID < tb.AccountID { + return true + } + if ta.AccountID > tb.AccountID { + return false + } + if ta.ProjectID < tb.ProjectID { + return true + } + if ta.ProjectID > tb.ProjectID { + return false + } if ta.MetricGroupID < tb.MetricGroupID { return true } diff --git a/lib/storage/search.go b/lib/storage/search.go index f460f6426..468824f28 100644 --- a/lib/storage/search.go +++ b/lib/storage/search.go @@ -158,7 +158,7 @@ func (s *Search) NextMetricBlock() bool { for s.ts.NextBlock() { tsid := &s.ts.Block.bh.TSID var err error - s.MetricBlock.MetricName, err = s.storage.searchMetricName(s.MetricBlock.MetricName[:0], tsid.MetricID) + s.MetricBlock.MetricName, err = s.storage.searchMetricName(s.MetricBlock.MetricName[:0], tsid.MetricID, tsid.AccountID, tsid.ProjectID) if err != nil { if err == io.EOF { // Missing metricName for tsid.MetricID. Increment error counter and skip it. @@ -182,6 +182,8 @@ func (s *Search) NextMetricBlock() bool { // SearchQuery is used for sending search queries from vmselect to vmstorage. type SearchQuery struct { + AccountID uint32 + ProjectID uint32 MinTimestamp int64 MaxTimestamp int64 TagFilterss [][]TagFilter @@ -263,8 +265,8 @@ func (tf *TagFilter) Unmarshal(src []byte) ([]byte, error) { // String returns string representation of the search query. 
func (sq *SearchQuery) String() string { var bb bytesutil.ByteBuffer - fmt.Fprintf(&bb, "MinTimestamp=%s, MaxTimestamp=%s, TagFilters=[\n", - timestampToTime(sq.MinTimestamp), timestampToTime(sq.MaxTimestamp)) + fmt.Fprintf(&bb, "AccountID=%d, ProjectID=%d, MinTimestamp=%s, MaxTimestamp=%s, TagFilters=[\n", + sq.AccountID, sq.ProjectID, timestampToTime(sq.MinTimestamp), timestampToTime(sq.MaxTimestamp)) for _, tagFilters := range sq.TagFilterss { for _, tf := range tagFilters { fmt.Fprintf(&bb, "%s", tf.String()) @@ -277,6 +279,8 @@ func (sq *SearchQuery) String() string { // Marshal appends marshaled sq to dst and returns the result. func (sq *SearchQuery) Marshal(dst []byte) []byte { + dst = encoding.MarshalUint32(dst, sq.AccountID) + dst = encoding.MarshalUint32(dst, sq.ProjectID) dst = encoding.MarshalVarInt64(dst, sq.MinTimestamp) dst = encoding.MarshalVarInt64(dst, sq.MaxTimestamp) dst = encoding.MarshalVarUint64(dst, uint64(len(sq.TagFilterss))) @@ -291,6 +295,18 @@ func (sq *SearchQuery) Marshal(dst []byte) []byte { // Unmarshal unmarshals sq from src and returns the tail. func (sq *SearchQuery) Unmarshal(src []byte) ([]byte, error) { + if len(src) < 4 { + return src, fmt.Errorf("cannot unmarshal AccountID: too short src len: %d; must be at least %d bytes", len(src), 4) + } + sq.AccountID = encoding.UnmarshalUint32(src) + src = src[4:] + + if len(src) < 4 { + return src, fmt.Errorf("cannot unmarshal ProjectID: too short src len: %d; must be at least %d bytes", len(src), 4) + } + sq.ProjectID = encoding.UnmarshalUint32(src) + src = src[4:] + tail, minTs, err := encoding.UnmarshalVarInt64(src) if err != nil { return src, fmt.Errorf("cannot unmarshal MinTimestamp: %s", err) diff --git a/lib/storage/search_test.go b/lib/storage/search_test.go index f0bdfd4bd..4a0f1f432 100644 --- a/lib/storage/search_test.go +++ b/lib/storage/search_test.go @@ -38,6 +38,12 @@ func TestSearchQueryMarshalUnmarshal(t *testing.T) { if len(tail) > 0 { t.Fatalf("unexpected tail left after SearchQuery unmarshaling; tail (len=%d): %q", len(tail), tail) } + if sq1.AccountID != sq1.AccountID { + t.Fatalf("unexpected AccountID; got %d; want %d", sq2.AccountID, sq1.AccountID) + } + if sq2.ProjectID != sq1.ProjectID { + t.Fatalf("unexpected ProjectID; got %d; want %d", sq2.ProjectID, sq1.ProjectID) + } if sq1.MinTimestamp != sq2.MinTimestamp { t.Fatalf("unexpected MinTimestamp; got %d; want %d", sq2.MinTimestamp, sq1.MinTimestamp) } @@ -99,6 +105,7 @@ func TestSearch(t *testing.T) { startTimestamp -= startTimestamp % (1e3 * 3600 * 24) blockRowsCount := 0 for i := 0; i < rowsCount; i++ { + mn.AccountID = uint32(i % accountsCount) mn.MetricGroup = []byte(fmt.Sprintf("metric_%d", i%metricGroupsCount)) mr := &mrs[i] @@ -162,7 +169,7 @@ func testSearch(st *Storage, tr TimeRange, mrs []MetricRow, accountsCount int) e var s Search for i := 0; i < 10; i++ { // Prepare TagFilters for search. - tfs := NewTagFilters() + tfs := NewTagFilters(uint32(i%accountsCount), 0) metricGroupRe := fmt.Sprintf(`metric_\d*%d%d`, i, i) if err := tfs.Add(nil, []byte(metricGroupRe), false, true); err != nil { return fmt.Errorf("cannot add metricGroupRe=%q: %s", metricGroupRe, err) diff --git a/lib/storage/storage.go b/lib/storage/storage.go index b68d5db56..f809996d1 100644 --- a/lib/storage/storage.go +++ b/lib/storage/storage.go @@ -461,26 +461,26 @@ func (s *Storage) DeleteMetrics(tfss []*TagFilters) (int, error) { // searchMetricName appends metric name for the given metricID to dst // and returns the result. 
-func (s *Storage) searchMetricName(dst []byte, metricID uint64) ([]byte, error) { - return s.idb().searchMetricName(dst, metricID) +func (s *Storage) searchMetricName(dst []byte, metricID uint64, accountID, projectID uint32) ([]byte, error) { + return s.idb().searchMetricName(dst, metricID, accountID, projectID) } -// SearchTagKeys searches for tag keys -func (s *Storage) SearchTagKeys(maxTagKeys int) ([]string, error) { - return s.idb().SearchTagKeys(maxTagKeys) +// SearchTagKeys searches for tag keys for the given (accountID, projectID). +func (s *Storage) SearchTagKeys(accountID, projectID uint32, maxTagKeys int) ([]string, error) { + return s.idb().SearchTagKeys(accountID, projectID, maxTagKeys) } -// SearchTagValues searches for tag values for the given tagKey -func (s *Storage) SearchTagValues(tagKey []byte, maxTagValues int) ([]string, error) { - return s.idb().SearchTagValues(tagKey, maxTagValues) +// SearchTagValues searches for tag values for the given tagKey in (accountID, projectID). +func (s *Storage) SearchTagValues(accountID, projectID uint32, tagKey []byte, maxTagValues int) ([]string, error) { + return s.idb().SearchTagValues(accountID, projectID, tagKey, maxTagValues) } -// GetSeriesCount returns the approximate number of unique time series. +// GetSeriesCount returns the approximate number of unique time series for the given (accountID, projectID). // // It includes the deleted series too and may count the same series // up to two times - in db and extDB. -func (s *Storage) GetSeriesCount() (uint64, error) { - return s.idb().GetSeriesCount() +func (s *Storage) GetSeriesCount(accountID, projectID uint32) (uint64, error) { + return s.idb().GetSeriesCount(accountID, projectID) } // MetricRow is a metric to insert into storage. @@ -507,15 +507,19 @@ func (mr *MetricRow) String() string { if err := mn.unmarshalRaw(mr.MetricNameRaw); err == nil { metricName = mn.String() } - return fmt.Sprintf("MetricName=%s, Timestamp=%d, Value=%f\n", - metricName, mr.Timestamp, mr.Value) + return fmt.Sprintf("MetricName=%s, Timestamp=%d, Value=%f\n", metricName, mr.Timestamp, mr.Value) } // Marshal appends marshaled mr to dst and returns the result. func (mr *MetricRow) Marshal(dst []byte) []byte { - dst = encoding.MarshalBytes(dst, mr.MetricNameRaw) - dst = encoding.MarshalUint64(dst, uint64(mr.Timestamp)) - dst = encoding.MarshalUint64(dst, math.Float64bits(mr.Value)) + return MarshalMetricRow(dst, mr.MetricNameRaw, mr.Timestamp, mr.Value) +} + +// MarshalMetricRow marshals MetricRow data to dst and returns the result. +func MarshalMetricRow(dst []byte, metricNameRaw []byte, timestamp int64, value float64) []byte { + dst = encoding.MarshalBytes(dst, metricNameRaw) + dst = encoding.MarshalUint64(dst, uint64(timestamp)) + dst = encoding.MarshalUint64(dst, math.Float64bits(value)) return dst } @@ -688,7 +692,7 @@ func (s *Storage) updateDateMetricIDCache(rows []rawRow, errors []error) []error // It is OK if the (date, metricID) entry is added multiple times to db // by concurrent goroutines. 
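`MetricRow.Marshal` is split into the exported `MarshalMetricRow` helper so a caller that already holds the raw metric name, timestamp and value can marshal a row without first building a `MetricRow`. A simplified sketch of that layout, assuming a varint length prefix for the name (as `encoding.MarshalBytes` suggests) and big-endian integers:

```go
package main

import (
	"encoding/binary"
	"fmt"
	"math"
)

// marshalMetricRow mimics MarshalMetricRow from the diff: length-prefixed
// metricNameRaw, then the timestamp, then the raw float64 bits of the value.
func marshalMetricRow(dst, metricNameRaw []byte, timestamp int64, value float64) []byte {
	dst = binary.AppendUvarint(dst, uint64(len(metricNameRaw))) // length prefix (assumed varint)
	dst = append(dst, metricNameRaw...)
	dst = binary.BigEndian.AppendUint64(dst, uint64(timestamp))
	dst = binary.BigEndian.AppendUint64(dst, math.Float64bits(value))
	return dst
}

func main() {
	row := marshalMetricRow(nil, []byte("raw-metric-name"), 1_560_000_000_000, 42.5)
	fmt.Println(len(row)) // 1 + 15 + 8 + 8 = 32 bytes for this example
}
```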
s.dateMetricIDCache.Set(keyBuf, nil) - if err := idb.storeDateMetricID(date, metricID); err != nil { + if err := idb.storeDateMetricID(date, metricID, r.TSID.AccountID, r.TSID.ProjectID); err != nil { errors = append(errors, err) continue } diff --git a/lib/storage/storage_test.go b/lib/storage/storage_test.go index 5dc2b1dd6..989e47864 100644 --- a/lib/storage/storage_test.go +++ b/lib/storage/storage_test.go @@ -194,7 +194,7 @@ func TestStorageDeleteMetrics(t *testing.T) { } // Verify no tag keys exist - tks, err := s.SearchTagKeys(1e5) + tks, err := s.SearchTagKeys(0, 0, 1e5) if err != nil { t.Fatalf("error in SearchTagKeys at the start: %s", err) } @@ -245,7 +245,7 @@ func TestStorageDeleteMetrics(t *testing.T) { }) // Verify no more tag keys exist - tks, err = s.SearchTagKeys(1e5) + tks, err = s.SearchTagKeys(0, 0, 1e5) if err != nil { t.Fatalf("error in SearchTagKeys after the test: %s", err) } @@ -264,12 +264,16 @@ func testStorageDeleteMetrics(s *Storage, workerNum int) error { const metricsCount = 30 workerTag := []byte(fmt.Sprintf("workerTag_%d", workerNum)) + accountID := uint32(workerNum) + projectID := uint32(123) tksAll := make(map[string]bool) tksAll[""] = true // __name__ for i := 0; i < metricsCount; i++ { var mrs []MetricRow var mn MetricName + mn.AccountID = accountID + mn.ProjectID = projectID job := fmt.Sprintf("job_%d_%d", i, workerNum) instance := fmt.Sprintf("instance_%d_%d", i, workerNum) mn.Tags = []Tag{ @@ -301,7 +305,7 @@ func testStorageDeleteMetrics(s *Storage, workerNum int) error { s.debugFlush() // Verify tag values exist - tvs, err := s.SearchTagValues(workerTag, 1e5) + tvs, err := s.SearchTagValues(accountID, projectID, workerTag, 1e5) if err != nil { return fmt.Errorf("error in SearchTagValues before metrics removal: %s", err) } @@ -310,7 +314,7 @@ func testStorageDeleteMetrics(s *Storage, workerNum int) error { } // Verify tag keys exist - tks, err := s.SearchTagKeys(1e5) + tks, err := s.SearchTagKeys(accountID, projectID, 1e5) if err != nil { return fmt.Errorf("error in SearchTagKeys before metrics removal: %s", err) } @@ -333,7 +337,7 @@ func testStorageDeleteMetrics(s *Storage, workerNum int) error { return n } for i := 0; i < metricsCount; i++ { - tfs := NewTagFilters() + tfs := NewTagFilters(accountID, projectID) if err := tfs.Add(nil, []byte("metric_.+"), false, true); err != nil { return fmt.Errorf("cannot add regexp tag filter: %s", err) } @@ -366,14 +370,14 @@ func testStorageDeleteMetrics(s *Storage, workerNum int) error { } // Make sure no more metrics left for the given workerNum - tfs := NewTagFilters() + tfs := NewTagFilters(accountID, projectID) if err := tfs.Add(nil, []byte(fmt.Sprintf("metric_.+_%d", workerNum)), false, true); err != nil { return fmt.Errorf("cannot add regexp tag filter for worker metrics: %s", err) } if n := metricBlocksCount(tfs); n != 0 { return fmt.Errorf("expecting zero metric blocks after deleting all the metrics; got %d blocks", n) } - tvs, err = s.SearchTagValues(workerTag, 1e5) + tvs, err = s.SearchTagValues(accountID, projectID, workerTag, 1e5) if err != nil { return fmt.Errorf("error in SearchTagValues after all the metrics are removed: %s", err) } @@ -451,6 +455,8 @@ func testStorageAddRows(s *Storage) error { {[]byte("instance"), []byte("1.2.3.4")}, } for j := 0; j < rowsPerAdd; j++ { + mn.AccountID = uint32(rand.Intn(2)) + mn.ProjectID = uint32(rand.Intn(3)) mn.MetricGroup = []byte(fmt.Sprintf("metric_%d", rand.Intn(100))) metricNameRaw := mn.marshalRaw(nil) timestamp := rand.Int63n(1e10) @@ -581,6 +587,8 
@@ func testStorageAddMetrics(s *Storage, workerNum int) error { {[]byte("instance"), []byte("1.2.3.4")}, } for i := 0; i < rowsCount; i++ { + mn.AccountID = 123 + mn.ProjectID = uint32(i % 3) mn.MetricGroup = []byte(fmt.Sprintf("metric_%d_%d", workerNum, rand.Intn(10))) metricNameRaw := mn.marshalRaw(nil) timestamp := rand.Int63n(1e10) diff --git a/lib/storage/storage_timing_test.go b/lib/storage/storage_timing_test.go index abdd5f46b..3cc195cc0 100644 --- a/lib/storage/storage_timing_test.go +++ b/lib/storage/storage_timing_test.go @@ -44,6 +44,8 @@ func benchmarkStorageAddRows(b *testing.B, rowsPerBatch int) { for pb.Next() { offset := int(atomic.AddUint64(&globalOffset, uint64(rowsPerBatch))) for i := 0; i < rowsPerBatch; i++ { + mn.AccountID = uint32(i) + mn.ProjectID = uint32(i % 3) mr := &mrs[i] mr.MetricNameRaw = mn.marshalRaw(mr.MetricNameRaw[:0]) mr.Timestamp = int64(offset + i) diff --git a/lib/storage/tag_filters.go b/lib/storage/tag_filters.go index 66c1c5bd5..bccb966d0 100644 --- a/lib/storage/tag_filters.go +++ b/lib/storage/tag_filters.go @@ -16,17 +16,22 @@ import ( // TagFilters represents filters used for filtering tags. type TagFilters struct { + accountID uint32 + projectID uint32 + tfs []tagFilter // Common prefix for all the tag filters. - // Contains encoded nsPrefixTagToMetricID. + // Contains encoded nsPrefixTagToMetricID + accountID + projectID commonPrefix []byte } -// NewTagFilters returns new TagFilters. -func NewTagFilters() *TagFilters { +// NewTagFilters returns new TagFilters for the given accountID and projectID. +func NewTagFilters(accountID, projectID uint32) *TagFilters { return &TagFilters{ - commonPrefix: marshalCommonPrefix(nil, nsPrefixTagToMetricID), + accountID: accountID, + projectID: projectID, + commonPrefix: marshalCommonPrefix(nil, nsPrefixTagToMetricID, accountID, projectID), } } @@ -69,16 +74,19 @@ func (tfs *TagFilters) Add(key, value []byte, isNegative, isRegexp bool) error { // String returns human-readable value for tfs. func (tfs *TagFilters) String() string { var bb bytes.Buffer + fmt.Fprintf(&bb, "AccountID=%d, ProjectID=%d", tfs.accountID, tfs.projectID) for i := range tfs.tfs { fmt.Fprintf(&bb, ", %s", tfs.tfs[i].String()) } return bb.String() } -// Reset resets the tf -func (tfs *TagFilters) Reset() { +// Reset resets the tf for the given accountID and projectID +func (tfs *TagFilters) Reset(accountID, projectID uint32) { + tfs.accountID = accountID + tfs.projectID = projectID tfs.tfs = tfs.tfs[:0] - tfs.commonPrefix = marshalCommonPrefix(tfs.commonPrefix[:0], nsPrefixTagToMetricID) + tfs.commonPrefix = marshalCommonPrefix(tfs.commonPrefix[:0], nsPrefixTagToMetricID, accountID, projectID) } // tagFilter represents a filter used for filtering tags. @@ -88,7 +96,7 @@ type tagFilter struct { isNegative bool isRegexp bool - // Prefix always contains {nsPrefixTagToMetricID, key}. + // Prefix always contains {nsPrefixTagToMetricID, AccountID, ProjectID, key}. // Additionally it contains: // - value ending with tagSeparatorChar if !isRegexp. // - non-regexp prefix if isRegexp. @@ -110,9 +118,9 @@ func (tf *tagFilter) String() string { return bb.String() } -// Marshal appends marshaled tf to dst +// MarshalNoAccountIDProjectID appends marshaled tf to dst // and returns the result. 
-func (tf *tagFilter) Marshal(dst []byte) []byte { +func (tf *tagFilter) MarshalNoAccountIDProjectID(dst []byte) []byte { dst = marshalTagValue(dst, tf.key) dst = marshalTagValue(dst, tf.value) diff --git a/lib/storage/tag_filters_test.go b/lib/storage/tag_filters_test.go index 74dc705a3..897ea10ff 100644 --- a/lib/storage/tag_filters_test.go +++ b/lib/storage/tag_filters_test.go @@ -403,7 +403,7 @@ func testGetRegexpPrefix(t *testing.T, s, expectedPrefix, expectedSuffix string) } func TestTagFiltersAddEmpty(t *testing.T) { - tfs := NewTagFilters() + tfs := NewTagFilters(0, 0) mustAdd := func(key, value []byte, isNegative, isRegexp bool) { t.Helper() @@ -437,7 +437,7 @@ func TestTagFiltersAddEmpty(t *testing.T) { expectTagFilter(2, ".+", false, true) // Empty regexp filters - tfs.Reset() + tfs.Reset(0, 0) mustAdd([]byte("foo"), []byte(".*"), false, true) if len(tfs.tfs) != 0 { t.Fatalf("unexpectedly added empty regexp filter %s", &tfs.tfs[0]) @@ -450,7 +450,7 @@ func TestTagFiltersAddEmpty(t *testing.T) { expectTagFilter(2, "foo||bar", true, true) // Verify that otner filters are added normally. - tfs.Reset() + tfs.Reset(0, 0) mustAdd(nil, []byte("foobar"), false, false) if len(tfs.tfs) != 1 { t.Fatalf("missing added filter") diff --git a/lib/storage/tsid.go b/lib/storage/tsid.go index 210787a3b..c8d99ae59 100644 --- a/lib/storage/tsid.go +++ b/lib/storage/tsid.go @@ -14,9 +14,17 @@ import ( // grouping of related metrics. // It is OK if their meaning differ from their naming. type TSID struct { + // AccountID is the id of the registered account. + AccountID uint32 + + // ProjectID is the id of the project. + // + // The ProjectID must be unique for the given AccountID. + ProjectID uint32 + // MetricGroupID is the id of metric group inside the given project. // - // MetricGroupID must be unique. + // MetricGroupID must be unique for the given (AccountID, ProjectID). // // Metric group contains metrics with the identical name like // 'memory_usage', 'http_requests', but with different @@ -32,7 +40,7 @@ type TSID struct { // JobID is the id of an individual job (aka service) // for the given project. // - // JobID must be unique. + // JobID must be unique for the given (AccountID, ProjectID). // // Service may consist of multiple instances. // See https://prometheus.io/docs/concepts/jobs_instances/ for details. @@ -41,7 +49,7 @@ type TSID struct { // InstanceID is the id of an instance (aka process) // for the given project. // - // InstanceID must be unique. + // InstanceID must be unique for the given (AccountID, ProjectID). // // See https://prometheus.io/docs/concepts/jobs_instances/ for details. InstanceID uint32 @@ -61,6 +69,8 @@ var marshaledTSIDSize = func() int { // Marshal appends marshaled t to dst and returns the result. 
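Adding AccountID and ProjectID as the leading TSID fields grows the marshaled TSID from 24 to 32 bytes (see the `tsid_test.go` change below) and makes both `TSID.Less` and the inlined comparison in `rawRowsSort` order rows by tenant before anything else, so one tenant's rows stay grouped on disk. A compact sketch of that ordering; only the tenant-first part of the comparison is shown, with `MetricGroupID` as the next tie-breaker:

```go
package main

import (
	"fmt"
	"sort"
)

// tsid mirrors the field order from the diff; only the fields needed for the
// tenant-first comparison are exercised below.
type tsid struct {
	AccountID     uint32
	ProjectID     uint32
	MetricGroupID uint64
	JobID         uint32
	InstanceID    uint32
	MetricID      uint64
}

// less compares by tenant first, then by MetricGroupID, following the order
// used by the inlined comparison in rawRowsSort.
func less(a, b *tsid) bool {
	if a.AccountID != b.AccountID {
		return a.AccountID < b.AccountID
	}
	if a.ProjectID != b.ProjectID {
		return a.ProjectID < b.ProjectID
	}
	return a.MetricGroupID < b.MetricGroupID
}

func main() {
	rows := []tsid{
		{AccountID: 2, ProjectID: 0, MetricGroupID: 1},
		{AccountID: 1, ProjectID: 1, MetricGroupID: 9},
		{AccountID: 1, ProjectID: 0, MetricGroupID: 5},
	}
	sort.Slice(rows, func(i, j int) bool { return less(&rows[i], &rows[j]) })
	for _, r := range rows {
		fmt.Printf("account=%d project=%d group=%d\n", r.AccountID, r.ProjectID, r.MetricGroupID)
	}
	// Rows sharing an (AccountID, ProjectID) pair end up adjacent after sorting.
}
```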
func (t *TSID) Marshal(dst []byte) []byte { + dst = encoding.MarshalUint32(dst, t.AccountID) + dst = encoding.MarshalUint32(dst, t.ProjectID) dst = encoding.MarshalUint64(dst, t.MetricGroupID) dst = encoding.MarshalUint32(dst, t.JobID) dst = encoding.MarshalUint32(dst, t.InstanceID) @@ -74,6 +84,10 @@ func (t *TSID) Unmarshal(src []byte) ([]byte, error) { return nil, fmt.Errorf("too short src; got %d bytes; want %d bytes", len(src), marshaledTSIDSize) } + t.AccountID = encoding.UnmarshalUint32(src) + src = src[4:] + t.ProjectID = encoding.UnmarshalUint32(src) + src = src[4:] t.MetricGroupID = encoding.UnmarshalUint64(src) src = src[8:] t.JobID = encoding.UnmarshalUint32(src) @@ -93,6 +107,18 @@ func (t *TSID) Less(b *TSID) bool { return false } + if t.AccountID < b.AccountID { + return true + } + if t.AccountID > b.AccountID { + return false + } + if t.ProjectID < b.ProjectID { + return true + } + if t.ProjectID > b.ProjectID { + return false + } if t.MetricGroupID < b.MetricGroupID { return true } diff --git a/lib/storage/tsid_test.go b/lib/storage/tsid_test.go index 7ceb6e2b1..20ca82502 100644 --- a/lib/storage/tsid_test.go +++ b/lib/storage/tsid_test.go @@ -13,7 +13,7 @@ func TestMarshaledTSIDSize(t *testing.T) { // This test makes sure marshaled format isn't changed. // If this test breaks then the storage format has been changed, // so it may become incompatible with the previously written data. - expectedSize := 24 + expectedSize := 32 if marshaledTSIDSize != expectedSize { t.Fatalf("unexpected marshaledTSIDSize; got %d; want %d", marshaledTSIDSize, expectedSize) } @@ -28,8 +28,27 @@ func TestTSIDLess(t *testing.T) { t.Fatalf("t2=%v cannot be less than t1=%v", &t2, &t1) } - t1.MetricID = 124 - t2.MetricID = 126 + t2.MetricID = 345 + t1.AccountID = 123 + if t1.Less(&t2) { + t.Fatalf("t1=%v cannot be less than t2=%v", &t1, &t2) + } + if !t2.Less(&t1) { + t.Fatalf("t2=%v must be less than t1=%v", &t2, &t1) + } + + t2 = t1 + t2.MetricID = 123 + t1.ProjectID = 8473 + if t1.Less(&t2) { + t.Fatalf("t1=%v cannot be less than t2=%v", &t1, &t2) + } + if !t2.Less(&t1) { + t.Fatalf("t2=%v must be less than t1=%v", &t2, &t1) + } + + t2 = t1 + t2.MetricID = 123 t1.MetricGroupID = 847 if t1.Less(&t2) { t.Fatalf("t1=%v cannot be less than t2=%v", &t1, &t2) diff --git a/vendor/github.com/lithammer/go-jump-consistent-hash/.travis.yml b/vendor/github.com/lithammer/go-jump-consistent-hash/.travis.yml new file mode 100644 index 000000000..a6d922c79 --- /dev/null +++ b/vendor/github.com/lithammer/go-jump-consistent-hash/.travis.yml @@ -0,0 +1,11 @@ +language: go + +go: + - 1.0 + - 1.1 + - 1.2 + - 1.3 + - 1.4 + - tip + +sudo: false diff --git a/vendor/github.com/lithammer/go-jump-consistent-hash/LICENSE b/vendor/github.com/lithammer/go-jump-consistent-hash/LICENSE new file mode 100644 index 000000000..9cc753370 --- /dev/null +++ b/vendor/github.com/lithammer/go-jump-consistent-hash/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Peter Renström + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial 
portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vendor/github.com/lithammer/go-jump-consistent-hash/README.md b/vendor/github.com/lithammer/go-jump-consistent-hash/README.md new file mode 100644 index 000000000..0f3a833fa --- /dev/null +++ b/vendor/github.com/lithammer/go-jump-consistent-hash/README.md @@ -0,0 +1,22 @@ +# Jump Consistent Hash + +[![Build Status](https://travis-ci.org/renstrom/go-jump-consistent-hash.svg?branch=master)](https://travis-ci.org/renstrom/go-jump-consistent-hash) +[![Godoc](https://img.shields.io/badge/godoc-reference-blue.svg?style=flat)](https://godoc.org/github.com/renstrom/go-jump-consistent-hash) + +Go implementation of the jump consistent hash algorithm[1] by John Lamping and Eric Veach. + +[1] http://arxiv.org/pdf/1406.2294v1.pdf + +## Usage + +```go +import jump "github.com/renstrom/go-jump-consistent-hash" + +func main() { + jump.Hash(256, 1024) // 520 +} +``` + +## License + +MIT diff --git a/vendor/github.com/lithammer/go-jump-consistent-hash/doc.go b/vendor/github.com/lithammer/go-jump-consistent-hash/doc.go new file mode 100644 index 000000000..309751588 --- /dev/null +++ b/vendor/github.com/lithammer/go-jump-consistent-hash/doc.go @@ -0,0 +1,131 @@ +// Example +// +// jump.Hash(256, 1024) // 520 +// +// Reference C++ implementation[1] +// +// int32_t JumpConsistentHash(uint64_t key, int32_t num_buckets) { +// int64_t b = -1, j = 0; +// while (j < num_buckets) { +// b = j; +// key = key * 2862933555777941757ULL + 1; +// j = (b + 1) * (double(1LL << 31) / double((key >> 33) + 1)); +// } +// return b; +// } +// +// Explanation of the algorithm +// +// Jump consistent hash works by computing when its output changes as the +// number of buckets increases. Let ch(key, num_buckets) be the consistent hash +// for the key when there are num_buckets buckets. Clearly, for any key, k, +// ch(k, 1) is 0, since there is only the one bucket. In order for the +// consistent hash function to balanced, ch(k, 2) will have to stay at 0 for +// half the keys, k, while it will have to jump to 1 for the other half. In +// general, ch(k, n+1) has to stay the same as ch(k, n) for n/(n+1) of the +// keys, and jump to n for the other 1/(n+1) of the keys. +// +// Here are examples of the consistent hash values for three keys, k1, k2, and +// k3, as num_buckets goes up: +// +// │ 1 │ 2 │ 3 │ 4 │ 5 │ 6 │ 7 │ 8 │ 9 │ 10 │ 11 │ 12 │ 13 │ 14 +// ───┼───┼───┼───┼───┼───┼───┼───┼───┼───┼────┼────┼────┼────┼──── +// k1 │ 0 │ 0 │ 2 │ 2 │ 4 │ 4 │ 4 │ 4 │ 4 │ 4 │ 4 │ 4 │ 4 │ 4 +// ───┼───┼───┼───┼───┼───┼───┼───┼───┼───┼────┼────┼────┼────┼──── +// k2 │ 0 │ 1 │ 1 │ 1 │ 1 │ 1 │ 1 │ 7 │ 7 │ 7 │ 7 │ 7 │ 7 │ 7 +// ───┼───┼───┼───┼───┼───┼───┼───┼───┼───┼────┼────┼────┼────┼──── +// k3 │ 0 │ 1 │ 1 │ 1 │ 1 │ 5 │ 5 │ 7 │ 7 │ 7 │ 10 │ 10 │ 10 │ 10 +// +// A linear time algorithm can be defined by using the formula for the +// probability of ch(key, j) jumping when j increases. It essentially walks +// across a row of this table. 
+// considers each successive bucket, j, from 1 to num_buckets-1, and uses
+// ch(key, j) to compute ch(key, j+1). At each bucket, j, it decides whether to
+// keep ch(k, j+1) the same as ch(k, j), or to jump its value to j. In order to
+// jump for the right fraction of keys, it uses a pseudo-random number
+// generator with the key as its seed. To jump for 1/(j+1) of keys, it
+// generates a uniform random number between 0.0 and 1.0, and jumps if the
+// value is less than 1/(j+1). At the end of the loop, it has computed
+// ch(k, num_buckets), which is the desired answer. In code:
+//
+// int ch(int key, int num_buckets) {
+// random.seed(key);
+// int b = 0; // This will track ch(key,j+1).
+// for (int j = 1; j < num_buckets; j++) {
+// if (random.next() < 1.0 / (j + 1)) b = j;
+// }
+// return b;
+// }
+//
+// We can convert this to a logarithmic time algorithm by exploiting that
+// ch(key, j+1) is usually unchanged as j increases, only jumping occasionally.
+// The algorithm will only compute the destinations of jumps -- the j's for
+// which ch(key, j+1) ≠ ch(key, j). Also notice that for these j's, ch(key,
+// j+1) = j. To develop the algorithm, we will treat ch(key, j) as a random
+// variable, so that we can use the notation for random variables to analyze
+// the fractions of keys for which various propositions are true. That will
+// lead us to a closed form expression for a pseudo-random variable whose value
+// gives the destination of the next jump.
+//
+// Suppose that the algorithm is tracking the bucket numbers of the jumps for a
+// particular key, k. And suppose that b was the destination of the last jump,
+// that is, ch(k, b) ≠ ch(k, b+1), and ch(k, b+1) = b. Now, we want to find the
+// next jump, the smallest j such that ch(k, j+1) ≠ ch(k, b+1), or
+// equivalently, the largest j such that ch(k, j) = ch(k, b+1). We will make a
+// pseudo-random variable whose value is that j. To get a probabilistic
+// constraint on j, note that for any bucket number, i, we have j ≥ i if and
+// only if the consistent hash hasn't changed by i, that is, if and only if
+// ch(k, i) = ch(k, b+1). Hence, the distribution of j must satisfy
+//
+// P(j ≥ i) = P( ch(k, i) = ch(k, b+1) )
+//
+// Fortunately, it is easy to compute that probability. Notice that since P(
+// ch(k, 10) = ch(k, 11) ) is 10/11, and P( ch(k, 11) = ch(k, 12) ) is 11/12,
+// then P( ch(k, 10) = ch(k, 12) ) is 10/11 * 11/12 = 10/12. In general, if n ≥
+// m, P( ch(k, n) = ch(k, m) ) = m / n. Thus for any i > b,
+//
+// P(j ≥ i) = P( ch(k, i) = ch(k, b+1) ) = (b+1) / i .
+//
+// Now, we generate a pseudo-random variable, r, (depending on k and j) that is
+// uniformly distributed between 0 and 1. Since we want P(j ≥ i) = (b+1) / i,
+// we set P(j ≥ i) iff r ≤ (b+1) / i. Solving the inequality for i yields P(j ≥
+// i) iff i ≤ (b+1) / r. Since i is a lower bound on j, j will equal the
+// largest i for which P(j ≥ i), thus the largest i satisfying i ≤ (b+1) / r.
+// Thus, by the definition of the floor function, j = floor((b+1) / r).
+//
+// Using this formula, jump consistent hash finds ch(key, num_buckets) by
+// choosing successive jump destinations until it finds a position at or past
+// num_buckets. It then knows that the previous jump destination is the answer.
+//
+// int ch(int key, int num_buckets) {
+// random.seed(key);
+// int b = -1; // bucket number before the previous jump
+// int j = 0; // bucket number before the current jump
+// while (j < num_buckets) {
+// b = j;
+// r = random.next();
+// j = floor((b + 1) / r);
+// }
+// return b;
+// }
+//
+// To turn this into the actual code of figure 1, we need to implement random.
+// We want it to be fast, and yet also to have well distributed successive
+// values. We use a 64-bit linear congruential generator; the particular
+// multiplier we use produces random numbers that are especially well
+// distributed in higher dimensions (i.e., when successive random values are
+// used to form tuples). We use the key as the seed. (For keys that don't fit
+// into 64 bits, a 64 bit hash of the key should be used.) The congruential
+// generator updates the seed on each iteration, and the code derives a double
+// from the current seed. Tests show that this generator has good speed and
+// distribution.
+//
+// It is worth noting that unlike the algorithm of Karger et al., jump
+// consistent hash does not require the key to be hashed if it is already an
+// integer. This is because jump consistent hash has an embedded pseudorandom
+// number generator that essentially rehashes the key on every iteration. The
+// hash is not especially good (i.e., linear congruential), but since it is
+// applied repeatedly, additional hashing of the input key is not necessary.
+//
+// [1] http://arxiv.org/pdf/1406.2294v1.pdf
+package jump
diff --git a/vendor/github.com/lithammer/go-jump-consistent-hash/jump.go b/vendor/github.com/lithammer/go-jump-consistent-hash/jump.go
new file mode 100644
index 000000000..fb62c665e
--- /dev/null
+++ b/vendor/github.com/lithammer/go-jump-consistent-hash/jump.go
@@ -0,0 +1,19 @@
+package jump
+
+// Hash takes a 64 bit key and the number of buckets. It outputs a bucket
+// number in the range [0, buckets).
+func Hash(key uint64, buckets int32) int32 {
+	var b, j int64
+
+	if buckets <= 0 {
+		buckets = 1
+	}
+
+	for j < int64(buckets) {
+		b = j
+		key = key*2862933555777941757 + 1
+		j = int64(float64(b+1) * (float64(int64(1)<<31) / float64((key>>33)+1)))
+	}
+
+	return int32(b)
+}
diff --git a/vendor/modules.txt b/vendor/modules.txt
index c1942d131..fb074a664 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -6,6 +6,8 @@ github.com/VictoriaMetrics/metrics
 github.com/cespare/xxhash/v2
 # github.com/golang/snappy v0.0.1
 github.com/golang/snappy
+# github.com/lithammer/go-jump-consistent-hash v1.0.0
+github.com/lithammer/go-jump-consistent-hash
 # github.com/valyala/bytebufferpool v1.0.0
 github.com/valyala/bytebufferpool
 # github.com/valyala/fastjson v1.4.1
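Note on the new vendored dependency: `jump.Hash(key, buckets)` maps a 64-bit key onto one of `buckets` buckets so that growing the bucket count from n to n+1 relocates only about 1/(n+1) of the keys. The sketch below is only an illustration of the vendored API, not the routing code used by `vminsert` (which is not part of this diff); `storageNodeIdx` is a hypothetical helper, and it reuses the already-vendored `xxhash` package to turn a tenant-aware series key into a 64-bit hash:

```go
package main

import (
	"encoding/binary"
	"fmt"

	"github.com/cespare/xxhash/v2"
	jump "github.com/lithammer/go-jump-consistent-hash"
)

// storageNodeIdx hashes the tenant-aware part of a series key
// (AccountID, ProjectID, MetricGroupID) into a uint64 and maps it onto one of
// `nodes` vmstorage instances with jump consistent hash. Growing the cluster
// from n to n+1 nodes moves only about 1/(n+1) of the keys to the new node.
func storageNodeIdx(accountID, projectID uint32, metricGroupID uint64, nodes int32) int32 {
	buf := make([]byte, 16)
	binary.BigEndian.PutUint32(buf[0:4], accountID)
	binary.BigEndian.PutUint32(buf[4:8], projectID)
	binary.BigEndian.PutUint64(buf[8:16], metricGroupID)
	return jump.Hash(xxhash.Sum64(buf), nodes)
}

func main() {
	// The same key always maps to the same node index in [0, 3).
	fmt.Println(storageNodeIdx(123, 8473, 847, 3))
}
```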
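Note on the `TSID` change above: `AccountID` and `ProjectID` are compared before all other fields in `TSID.Less`, so sorted (and therefore on-disk) ordering groups each tenant's series together. The standalone sketch below uses a hypothetical, trimmed-down `tsidKey` type and `less` helper (the real `TSID` also carries `JobID`, `InstanceID` and `MetricID`) to show the effect of that ordering with a plain `sort.Slice`:

```go
package main

import (
	"fmt"
	"sort"
)

// tsidKey is a trimmed-down stand-in for lib/storage.TSID with only the
// fields needed to demonstrate the new sort order.
type tsidKey struct {
	AccountID     uint32
	ProjectID     uint32
	MetricGroupID uint64
}

// less mirrors the ordering added to TSID.Less: the tenant identifiers
// (AccountID, ProjectID) are the most significant fields, so all series
// belonging to one tenant end up adjacent in sorted order.
func less(a, b tsidKey) bool {
	if a.AccountID != b.AccountID {
		return a.AccountID < b.AccountID
	}
	if a.ProjectID != b.ProjectID {
		return a.ProjectID < b.ProjectID
	}
	return a.MetricGroupID < b.MetricGroupID
}

func main() {
	keys := []tsidKey{
		{AccountID: 2, ProjectID: 1, MetricGroupID: 10},
		{AccountID: 1, ProjectID: 2, MetricGroupID: 99},
		{AccountID: 1, ProjectID: 1, MetricGroupID: 50},
	}
	sort.Slice(keys, func(i, j int) bool { return less(keys[i], keys[j]) })
	fmt.Println(keys) // [{1 1 50} {1 2 99} {2 1 10}] -- grouped by tenant first
}
```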