app/vmstorage: add vm_slow_row_inserts_total and vm_slow_per_day_index_inserts_total metrics for determining whether VictoriaMetrics requires more RAM for the current number of active time series

Aliaksandr Valialkin 2020-05-15 13:44:23 +03:00
parent 2c4d05db10
commit d6b9a49481
3 changed files with 25 additions and 3 deletions

View file

@@ -357,6 +357,13 @@ func registerStorageMetrics(strg *storage.Storage) {
return float64(m().AddRowsConcurrencyCurrent)
})
metrics.NewGauge(`vm_slow_row_inserts_total`, func() float64 {
return float64(m().SlowRowInserts)
})
metrics.NewGauge(`vm_slow_per_day_index_inserts_total`, func() float64 {
return float64(m().SlowPerDayIndexInserts)
})
metrics.NewGauge(`vm_rows{type="storage/big"}`, func() float64 {
return float64(tm().BigRowsCount)
})

View file

@@ -910,6 +910,9 @@ The most interesting metrics are:
* `sum(rate(vm_rows_inserted_total[5m]))` - ingestion rate, i.e. how many samples are inserted into the database per second.
* `vm_free_disk_space_bytes` - free space left at `-storageDataPath`.
* `sum(vm_data_size_bytes)` - the total size of data on disk.
* `increase(vm_slow_row_inserts_total[5m])` - the number of slow inserts during the last 5 minutes.
If this value remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
of the current number of active time series.
### Troubleshooting
@@ -922,8 +925,9 @@ The most interesting metrics are:
* If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
then it is likely you have too many active time series for the current amount of RAM.
See `vm_slow_row_inserts_total` and `vm_slow_per_day_index_inserts_total` [metrics](#monitoring).
It is recommended to increase the amount of RAM on the node with VictoriaMetrics in order to improve
ingestion performance.
ingestion performance in this case.
Another option is to increase the `-memory.allowedPercent` command-line flag value. Be careful with this
option, since too big a value for `-memory.allowedPercent` may result in high I/O usage.

View file

@@ -39,6 +39,9 @@ type Storage struct {
addRowsConcurrencyLimitTimeout uint64
addRowsConcurrencyDroppedRows uint64
slowRowInserts uint64
slowPerDayIndexInserts uint64
path string
cachePath string
retentionMonths int
@@ -333,6 +336,9 @@ type Metrics struct {
AddRowsConcurrencyCapacity uint64
AddRowsConcurrencyCurrent uint64
SlowRowInserts uint64
SlowPerDayIndexInserts uint64
TSIDCacheSize uint64
TSIDCacheSizeBytes uint64
TSIDCacheRequests uint64
@@ -387,6 +393,9 @@ func (s *Storage) UpdateMetrics(m *Metrics) {
m.AddRowsConcurrencyCapacity = uint64(cap(addRowsConcurrencyCh))
m.AddRowsConcurrencyCurrent = uint64(len(addRowsConcurrencyCh))
m.SlowRowInserts += atomic.LoadUint64(&s.slowRowInserts)
m.SlowPerDayIndexInserts += atomic.LoadUint64(&s.slowPerDayIndexInserts)
var cs fastcache.Stats
s.tsidCache.UpdateStats(&cs)
m.TSIDCacheSize += cs.EntriesCount
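
A side note on the pattern visible in this hunk: the live counters live on `Storage` as plain `uint64` fields that the hot path bumps with `atomic.AddUint64`, while `UpdateMetrics` copies them into the caller-supplied `Metrics` struct via atomic loads; the `+=` presumably lets a caller accumulate values from several sources into one `Metrics` value. Below is a stripped-down sketch of that snapshot pattern, reduced to the two new counters; these stubs are not the real structs.

package storagesketch

import "sync/atomic"

// storageStub mimics the relevant part of storage.Storage: counters are
// plain uint64 fields, mutated only through the sync/atomic helpers.
type storageStub struct {
	slowRowInserts         uint64
	slowPerDayIndexInserts uint64
}

// metricsStub mimics storage.Metrics: a plain snapshot struct.
type metricsStub struct {
	SlowRowInserts         uint64
	SlowPerDayIndexInserts uint64
}

// UpdateMetrics adds the current counter values into m, mirroring the
// `+=` style used in the hunk above.
func (s *storageStub) UpdateMetrics(m *metricsStub) {
	m.SlowRowInserts += atomic.LoadUint64(&s.slowRowInserts)
	m.SlowPerDayIndexInserts += atomic.LoadUint64(&s.slowPerDayIndexInserts)
}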
@@ -1161,6 +1170,7 @@ func (s *Storage) add(rows []rawRow, mrs []MetricRow, precisionBits uint8) ([]ra
}
}
if pmrs != nil {
atomic.AddUint64(&s.slowRowInserts, uint64(len(pmrs.pmrs)))
// Sort pendingMetricRows by canonical metric name in order to speed up search via `is` in the loop below.
pendingMetricRows := pmrs.pmrs
sort.Slice(pendingMetricRows, func(i, j int) bool {
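
The increment above sits on the slow path of `(*Storage).add`: rows whose metric names were not found in the TSID cache are collected into `pmrs`, so `len(pmrs.pmrs)` is exactly the number of rows that need an index lookup instead of a cache hit. A hedged sketch of that accounting plus the sort that follows; the types below are simplified stand-ins rather than the real `pendingMetricRows` machinery.

package storagesketch

import (
	"sort"
	"sync/atomic"
)

// pendingMetricRowStub stands in for a row that missed the TSID cache;
// only the canonical metric name matters for the sort.
type pendingMetricRowStub struct {
	MetricName []byte
}

// countAndSortSlowRows mirrors the logic shown in the hunk above: count
// every row that fell off the fast path, then sort the pending rows by
// metric name so the follow-up indexdb searches proceed in roughly
// sequential key order.
func countAndSortSlowRows(slowRowInserts *uint64, pending []pendingMetricRowStub) {
	atomic.AddUint64(slowRowInserts, uint64(len(pending)))
	sort.Slice(pending, func(i, j int) bool {
		return string(pending[i].MetricName) < string(pending[j].MetricName)
	})
}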
@@ -1371,6 +1381,7 @@ func (s *Storage) updatePerDateData(rows []rawRow) error {
// Slow path - add new (date, metricID) entries to indexDB.
atomic.AddUint64(&s.slowPerDayIndexInserts, uint64(len(pendingDateMetricIDs)))
// Sort pendingDateMetricIDs by (accountID, projectID, date, metricID) in order to speed up `is` search in the loop below.
sort.Slice(pendingDateMetricIDs, func(i, j int) bool {
a := pendingDateMetricIDs[i]
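
The excerpt ends in the middle of the `sort.Slice` comparator. Purely for illustration, and assuming the field names from the comment above (the real struct in `lib/storage` may be laid out differently), a comparator ordering by (accountID, projectID, date, metricID) would typically read:

package storagesketch

// pendingDateMetricIDStub is a hypothetical layout matching the fields
// named in the sort comment; it is not copied from the repository.
type pendingDateMetricIDStub struct {
	accountID uint32
	projectID uint32
	date      uint64
	metricID  uint64
}

// lessPendingDateMetricID compares two entries lexicographically by
// (accountID, projectID, date, metricID), the order the comment asks for.
func lessPendingDateMetricID(a, b pendingDateMetricIDStub) bool {
	if a.accountID != b.accountID {
		return a.accountID < b.accountID
	}
	if a.projectID != b.projectID {
		return a.projectID < b.projectID
	}
	if a.date != b.date {
		return a.date < b.date
	}
	return a.metricID < b.metricID
}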