app/vmstorage: add vm_slow_metric_name_loads_total metric, which can be used as an indicator that more RAM is needed to improve query performance

Aliaksandr Valialkin 2020-05-15 14:11:39 +03:00
parent 82ccdfaa91
commit 82ffbcb9a6
4 changed files with 21 additions and 8 deletions

View file

@@ -911,8 +911,11 @@ The most interesting metrics are:
 * `vm_free_disk_space_bytes` - free space left at `-storageDataPath`.
 * `sum(vm_data_size_bytes)` - the total size of data on disk.
 * `increase(vm_slow_row_inserts_total[5m])` - the number of slow inserts during the last 5 minutes.
-  If this value remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
-  for the current number of active time series.
+  If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
+  of the current number of active time series.
+* `increase(vm_slow_metric_name_loads_total[5m])` - the number of slow loads of metric names during the last 5 minutes.
+  If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
+  of the current number of active time series.
 
 ### Troubleshooting
@@ -925,9 +928,9 @@ The most interesting metrics are:
 * If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
   then it is likely you have too many active time series for the current amount of RAM.
-  See `vm_slow_row_inserts_total` and `vm_slow_per_day_index_inserts_total` [metrics](#monitoring).
+  VictoriaMetrics [exposes](#monitoring) `vm_slow_*` metrics, which could be used as an indicator of low amounts of RAM.
   It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
-  ingestion performance in this case.
+  ingestion and query performance in this case.
   Another option is to increase `-memory.allowedPercent` command-line flag value. Be careful with this
   option, since too big value for `-memory.allowedPercent` may result in high I/O usage.

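The `increase(...)` expressions documented above can also be checked from a script. Below is a minimal Go sketch, assuming a single-node VictoriaMetrics instance on the default `http://localhost:8428` and an arbitrary threshold of 1000 slow loads per 5 minutes; the file name, threshold and warning text are illustrative, and only the query expression and the Prometheus-compatible `/api/v1/query` endpoint come from VictoriaMetrics itself.

```go
// slowloads_check.go: an illustrative watcher for the query shown above.
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"net/url"
	"strconv"
)

// promResponse covers only the fields we need from the Prometheus-compatible
// /api/v1/query response served by VictoriaMetrics.
type promResponse struct {
	Status string `json:"status"`
	Data   struct {
		Result []struct {
			Value [2]interface{} `json:"value"` // [unix timestamp, "value as a string"]
		} `json:"result"`
	} `json:"data"`
}

func main() {
	const vmAddr = "http://localhost:8428"                        // default single-node listen address
	const query = `increase(vm_slow_metric_name_loads_total[5m])` // expression from the docs above
	const threshold = 1000.0                                      // illustrative; tune for your workload

	resp, err := http.Get(vmAddr + "/api/v1/query?query=" + url.QueryEscape(query))
	if err != nil {
		log.Fatalf("cannot query VictoriaMetrics: %s", err)
	}
	defer resp.Body.Close()

	var pr promResponse
	if err := json.NewDecoder(resp.Body).Decode(&pr); err != nil {
		log.Fatalf("cannot parse response: %s", err)
	}
	for _, r := range pr.Data.Result {
		s, ok := r.Value[1].(string)
		if !ok {
			continue
		}
		v, err := strconv.ParseFloat(s, 64)
		if err != nil {
			log.Fatalf("unexpected sample value %q: %s", s, err)
		}
		if v > threshold {
			fmt.Printf("slow metric name loads over the last 5m: %.0f - consider adding RAM\n", v)
		}
	}
}
```

The same loop works for `vm_slow_row_inserts_total` and the other `vm_slow_*` counters mentioned in the Troubleshooting section.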
View file

@@ -415,6 +415,9 @@ func registerStorageMetrics() {
 	metrics.NewGauge(`vm_slow_per_day_index_inserts_total`, func() float64 {
 		return float64(m().SlowPerDayIndexInserts)
 	})
+	metrics.NewGauge(`vm_slow_metric_name_loads_total`, func() float64 {
+		return float64(m().SlowMetricNameLoads)
+	})
 	metrics.NewGauge(`vm_rows{type="storage/big"}`, func() float64 {
 		return float64(tm().BigRowsCount)

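For context on the hunk above: `github.com/VictoriaMetrics/metrics` evaluates the gauge callback on every scrape, so the exported value simply mirrors whatever counter the callback reads. A minimal standalone sketch of that pattern follows; the counter variable, the `/metrics` handler and the `:8080` address are illustrative stand-ins for vmstorage's own wiring.

```go
// A standalone sketch of the callback-gauge pattern used above.
package main

import (
	"log"
	"net/http"
	"sync/atomic"

	"github.com/VictoriaMetrics/metrics"
)

// slowMetricNameLoads stands in for the counter maintained by the storage layer.
var slowMetricNameLoads uint64

func main() {
	// The callback runs on every scrape, so the exported value always reflects
	// the current counter state without any extra bookkeeping.
	metrics.NewGauge(`vm_slow_metric_name_loads_total`, func() float64 {
		return float64(atomic.LoadUint64(&slowMetricNameLoads))
	})

	// Simulate the storage layer recording a slow metric name load.
	atomic.AddUint64(&slowMetricNameLoads, 1)

	http.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
		metrics.WritePrometheus(w, true)
	})
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```

Like the other `vm_slow_*` metrics registered in this file, the new metric is exported through `metrics.NewGauge` with a read callback rather than through a `metrics.Counter`, even though its name ends in `_total`.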
View file

@@ -911,8 +911,11 @@ The most interesting metrics are:
 * `vm_free_disk_space_bytes` - free space left at `-storageDataPath`.
 * `sum(vm_data_size_bytes)` - the total size of data on disk.
 * `increase(vm_slow_row_inserts_total[5m])` - the number of slow inserts during the last 5 minutes.
-  If this value remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
-  for the current number of active time series.
+  If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
+  of the current number of active time series.
+* `increase(vm_slow_metric_name_loads_total[5m])` - the number of slow loads of metric names during the last 5 minutes.
+  If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
+  of the current number of active time series.
 
 ### Troubleshooting
@@ -925,9 +928,9 @@ The most interesting metrics are:
 * If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
   then it is likely you have too many active time series for the current amount of RAM.
-  See `vm_slow_row_inserts_total` and `vm_slow_per_day_index_inserts_total` [metrics](#monitoring).
+  VictoriaMetrics [exposes](#monitoring) `vm_slow_*` metrics, which could be used as an indicator of low amounts of RAM.
   It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
-  ingestion performance in this case.
+  ingestion and query performance in this case.
   Another option is to increase `-memory.allowedPercent` command-line flag value. Be careful with this
   option, since too big value for `-memory.allowedPercent` may result in high I/O usage.

View file

@@ -41,6 +41,7 @@ type Storage struct {
 	slowRowInserts         uint64
 	slowPerDayIndexInserts uint64
+	slowMetricNameLoads    uint64
 
 	path      string
 	cachePath string
@@ -328,6 +329,7 @@ type Metrics struct {
 	SlowRowInserts         uint64
 	SlowPerDayIndexInserts uint64
+	SlowMetricNameLoads    uint64
 
 	TSIDCacheSize      uint64
 	TSIDCacheSizeBytes uint64
@@ -385,6 +387,7 @@ func (s *Storage) UpdateMetrics(m *Metrics) {
 	m.SlowRowInserts += atomic.LoadUint64(&s.slowRowInserts)
 	m.SlowPerDayIndexInserts += atomic.LoadUint64(&s.slowPerDayIndexInserts)
+	m.SlowMetricNameLoads += atomic.LoadUint64(&s.slowMetricNameLoads)
 
 	var cs fastcache.Stats
 	s.tsidCache.UpdateStats(&cs)
@@ -814,6 +817,7 @@ func (s *Storage) prefetchMetricNames(tsids []TSID) error {
 		}
 		metricIDs = append(metricIDs, metricID)
 	}
+	atomic.AddUint64(&s.slowMetricNameLoads, uint64(len(metricIDs)))
 	if len(metricIDs) < 500 {
 		// It is cheaper to skip pre-fetching and obtain metricNames inline.
 		return nil
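
Taken together, the storage-side changes follow a simple pattern: a plain `uint64` field is bumped with `sync/atomic` on the query path and copied into the `Metrics` snapshot by `UpdateMetrics`. The sketch below condenses that pattern into a self-contained program; the surrounding types are trimmed to the essentials, and only the names taken from the diff are real.

```go
// A reduced sketch of the accounting added in this file.
package main

import (
	"fmt"
	"sync/atomic"
)

// Storage keeps a plain uint64 counter that hot paths bump with sync/atomic.
type Storage struct {
	slowMetricNameLoads uint64
}

// Metrics is the snapshot struct handed to the exporter.
type Metrics struct {
	SlowMetricNameLoads uint64
}

// UpdateMetrics copies the live counter into the snapshot, as in the diff above.
func (s *Storage) UpdateMetrics(m *Metrics) {
	m.SlowMetricNameLoads += atomic.LoadUint64(&s.slowMetricNameLoads)
}

// prefetchMetricNames mimics the control flow above: every metric name that
// misses the cache counts as a slow load, but bulk pre-fetching is only worth
// doing for reasonably large batches.
func (s *Storage) prefetchMetricNames(missingMetricIDs []uint64) {
	atomic.AddUint64(&s.slowMetricNameLoads, uint64(len(missingMetricIDs)))
	if len(missingMetricIDs) < 500 {
		// It is cheaper to resolve a handful of metric names inline at query time.
		return
	}
	// ... a bulk pre-fetch of metric names would go here ...
}

func main() {
	var s Storage
	s.prefetchMetricNames(make([]uint64, 3)) // three cache misses

	var m Metrics
	s.UpdateMetrics(&m)
	fmt.Println("slow metric name loads:", m.SlowMetricNameLoads) // prints 3
}
```

As the diff shows, every metric name that has to be loaded outside the prefetched set is counted as a slow load, while the actual bulk pre-fetch is skipped for batches under 500 entries, since those are cheaper to resolve inline.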