[draft] per tenant statistic (#121)

* [draft] per tenant statistic

* updates metric name
update graph
adds link and example config

* quick fix

* adds grafana dashboard
adds example alert

Co-authored-by: f41gh7 <nik@victoriametrics.com>
This commit is contained in:
Artem Navoiev 2021-04-14 11:17:52 +03:00 committed by Aliaksandr Valialkin
parent 251747e253
commit e9ee2122df
3 changed files with 743 additions and 0 deletions

View file

@ -0,0 +1,675 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Overview for enterprise cluster VictoriaMetrics v1.56.0 or higher",
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": 13,
"iteration": 1617980754279,
"links": [],
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$ds",
"description": "How many datapoints are inserted into storage per second by accountID and projectID",
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"hiddenSeries": false,
"id": 2,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.3.4",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum(increase(vm_tenant_inserted_rows_total{job=~\"$job\", instance=~\"$instance\",accountID=~\"$accountID\", projectID=~\"$projectID\"}[1m])/60) by (accountID,projectID) ",
"interval": "",
"legendFormat": "inserted rows: {{accountID}}:{{projectID}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Datapoints ingestion rate ($instance)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$ds",
"description": "Request rate accepted by vmselect nodes per tenant",
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"hiddenSeries": false,
"id": 4,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null as zero",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.3.4",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(vm_tenant_select_requests_total{job=~\"$job\", instance=~\"$instance.*\",accountID=~\"$accountID\", projectID=~\"$projectID\"}[5m])) by (accountID,projectID) ",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "tenant: {{accountID}}{{projectID}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Query rate ($instance)",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$ds",
"description": "Shows the number of active time series with new data points inserted during the last hour. High value may result in ingestion slowdown. \n\nSee following link for details:",
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"hiddenSeries": false,
"id": 6,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [
{
"targetBlank": true,
"title": "troubleshooting",
"url": "https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#troubleshooting"
}
],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.3.4",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(vm_tenant_active_timeseries{job=~\"$job\", instance=~\"$instance.*\",accountID=~\"$accountID\",projectID=~\"$projectID\"}) by(accountID,projectID)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "Active time series tenant: {{accountID}}:{{projectID}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Active time series ($instance)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$ds",
"description": "Shows how many of new time-series are created every second. High churn rate tightly connected with database performance and may result in unexpected OOM's or slow queries. It is recommended to always keep an eye on this metric to avoid unexpected cardinality \"explosions\".\n\nGood references to read:\n* https://www.robustperception.io/cardinality-is-key\n* https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality",
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"hiddenSeries": false,
"id": 8,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.3.4",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(increase(vm_tenant_timeseries_created_total{job=~\"$job\", instance=~\"$instance\",accountID=~\"$accountID\", projectID=~\"$projectID\"}[1m])/60) by(accountID,projectID)",
"interval": "",
"legendFormat": "churn rate tenant: {{accountID}}:{{projectID}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Churn rate ($instance)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$ds",
"description": "Shows amount of on-disk space occupied by data points.",
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"hiddenSeries": false,
"id": 10,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.3.4",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum(vm_tenant_used_tenant_bytes{job=\"$job_storage\", instance=~\"$instance\",accountID=~\"$accountID\",projectID=~\"$projectID\"}) by(accountID,projectID)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{accountID}}:{{projectID}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Disk space usage (datapoints) ($instance)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"schemaVersion": 26,
"style": "dark",
"tags": [
"VictoriaMetrics",
"monitoring"
],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "gw",
"value": "gw"
},
"error": null,
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "ds",
"options": [],
"query": "prometheus",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"allValue": ".*",
"current": {
"selected": false,
"text": "All",
"value": "$__all"
},
"datasource": "$ds",
"definition": "label_values(vm_app_version{version=~\"^vm(insert|select|storage).*\"}, job)",
"error": null,
"hide": 0,
"includeAll": true,
"label": null,
"multi": true,
"name": "job",
"options": [],
"query": "label_values(vm_app_version{version=~\"^vm(insert|select|storage).*\"}, job)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": ".*",
"current": {
"selected": false,
"text": "All",
"value": "$__all"
},
"datasource": "$ds",
"definition": "label_values(vm_app_version{job=~\"$job\"}, instance)",
"error": null,
"hide": 0,
"includeAll": true,
"label": null,
"multi": false,
"name": "instance",
"options": [],
"query": "label_values(vm_app_version{job=~\"$job\"}, instance)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": ".*",
"current": {
"selected": false,
"text": "All",
"value": "$__all"
},
"datasource": "$ds",
"definition": "label_values(vm_tenant_active_timeseries{job=~\"$job\"},accountID)",
"error": null,
"hide": 0,
"includeAll": true,
"label": null,
"multi": false,
"name": "accountID",
"options": [],
"query": "label_values(vm_tenant_active_timeseries{job=~\"$job\"},accountID)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": ".*",
"current": {
"selected": false,
"text": "All",
"value": "$__all"
},
"datasource": "$ds",
"definition": "label_values(vm_tenant_active_timeseries{accountID=~\"$accountID\"},projectID)",
"error": null,
"hide": 0,
"includeAll": true,
"label": null,
"multi": false,
"name": "projectID",
"options": [],
"query": "label_values(vm_tenant_active_timeseries{accountID=~\"$accountID\"},projectID)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "VictoriaMetrics cluster per tenant Copy",
"uid": "IZFqd3lMz",
"version": 1
}

View file

@ -0,0 +1,68 @@
## Victoria Metrics Cluster Per Tenant Statistic
<img alt="cluster-per-tenant-stat" src="per-tenant-stats.jpg">
Enterprise version of Victoria Metrics Cluster exposes usage statistic for each tenant.
The next statistic is exposed:
- `vminsert`
* `vm_tenant_inserted_rows_total` - the ingestion rate by tenant
- `vmselect`
* `vm_tenant_select_requests_duration_ms_total` - query latency by tenant. It can be usefull for identifing tenant with the most heaviest queries
* `vm_tenant_select_requests_total` - total requests. You can calculate request rate (qps) by using this metric
- `vmstorage`
* `vm_tenant_active_timeseries` - the number of active timeseries
* `vm_tenant_used_tenant_bytes` - the disk spaces is consumed by metrics for particular tenant
* `vm_tenant_timeseries_created_total` - the total number for timeseries by tenant
The information should be scrapped by the agent (`vmagent`, `victoriametrics`, prometheus, etc) and stored in TSDB. This can be the same cluster but a different tenant, but we encourage to use one more instance of TSDB (more lightweight, eg. VM single) for monitoring of monitoring.
the config example for statistic scrapping
```yaml
scrape_configs:
- job_name: cluster
scrape_interval: 10s
static_configs:
- targets: ['vmselect:8481','vmstorage:8482','vminsert:8480']
```
### Visualization
Visualisation of statistic can be done in grafana using this dashboard [link](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster/dashboards/clusterbytenant.json)
### Integration with vmgateway:
Per Tenant Statistic is the source data for `vmgateway` rate limiter. More information can be found [here](https://victoriametrics.github.io/vmgateway.html)
### Integration with vmalert:
You can generate alerts based on each tenant resource usage and notify the system/people about reaching the limits.
here is an example of alert by high churn rate by the tenant
```yaml
- alert: TooHighChurnRate
expr: |
(
sum(rate(vm_tenant_timeseries_created_total[5m])) by(accountID,projectID)
/
sum(rate(vm_tenant_inserted_rows_total[5m])) by(accountID,projectID)
) > 0.1
for: 15m
labels:
severity: warning
annotations:
summary: "Churn rate is more than 10% for the last 15m"
description: "VM constantly creates new time series at tenant: {{ $labels.accountID }}:{{ $labels.projectID }}.\n
This effect is known as Churn Rate.\n
High Churn Rate tightly connected with database performance and may
result in unexpected OOM's or slow queries."
```

BIN
docs/per-tenant-stats.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 82 KiB