lib/storage: improve performance for indexSearch.containsTimeRange()

The indexSearch.containsTimeRange() function is called for the current indexDB and the previous indexDB
every time when searching for metricIDs by label filters. This function consumes a lot of additional CPU time
for cases when queries with lightweight label filters are sent to VictoriaMetrics at high rate (e.g. thousands of RPS),
like in the issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7009 .

Optimize indexSearch.containsTimeRange() function in the following ways:

- Unconditionally return true if this function is called for the current indexDB, since there are very high
  chances that the current indexDB contains the data with timestamps in the requested time range.

- Cache the minimum timestamp, which is missing in the indexed data for the previous indexDB.
  This is safe to do, since the previous indexDB is readonly.
  This optimization eliminates potentially slow lookup in the previous indexDB for typical
  use cases when the requested time range is close to the current time.
This commit is contained in:
Aliaksandr Valialkin 2024-09-20 13:07:17 +02:00
parent 6f61e9d49d
commit 787b9cd9a0
No known key found for this signature in database
GPG key ID: 52C003EE2BCDB9EB

View file

@ -91,6 +91,14 @@ type indexDB struct {
// The db must be automatically recovered after that. // The db must be automatically recovered after that.
missingMetricNamesForMetricID atomic.Uint64 missingMetricNamesForMetricID atomic.Uint64
// minMissingTimestamp is the minimum timestamp, which is missing in the given indexDB.
//
// This field is used at containsTimeRange() function only for the previous indexDB,
// since this indexDB is readonly.
// This field cannot be used for the current indexDB, since it may receive data
// with bigger timestamps at any time.
minMissingTimestamp atomic.Int64
// generation identifies the index generation ID // generation identifies the index generation ID
// and is used for syncing items from different indexDBs // and is used for syncing items from different indexDBs
generation uint64 generation uint64
@ -274,6 +282,14 @@ func (db *indexDB) doExtDB(f func(extDB *indexDB)) {
} }
} }
// hasExtDB returns true if db.extDB != nil
func (db *indexDB) hasExtDB() bool {
db.extDBLock.Lock()
ok := db.extDB != nil
db.extDBLock.Unlock()
return ok
}
// SetExtDB sets external db to search. // SetExtDB sets external db to search.
// //
// It decrements refCount for the previous extDB. // It decrements refCount for the previous extDB.
@ -1904,13 +1920,37 @@ func (is *indexSearch) searchMetricName(dst []byte, metricID uint64) ([]byte, bo
return dst, true return dst, true
} }
func (is *indexSearch) containsTimeRange(tr TimeRange) (bool, error) { func (is *indexSearch) containsTimeRange(tr TimeRange) bool {
db := is.db
if db.hasExtDB() {
// The db corresponds to the current indexDB, which is used for storing index data for newly registered time series.
// This means that it may contain data for the given tr with probability close to 100%.
return true
}
// The db corresponds to the previous indexDB, which is readonly.
// So it is safe caching the minimum timestamp, which isn't covered by the db.
minMissingTimestamp := db.minMissingTimestamp.Load()
if minMissingTimestamp != 0 && tr.MinTimestamp >= minMissingTimestamp {
return false
}
if is.containsTimeRangeSlow(tr) {
return true
}
db.minMissingTimestamp.CompareAndSwap(minMissingTimestamp, tr.MinTimestamp)
return false
}
func (is *indexSearch) containsTimeRangeSlow(tr TimeRange) bool {
ts := &is.ts ts := &is.ts
kb := &is.kb kb := &is.kb
// Verify whether the tr.MinTimestamp is included into `ts` or is smaller than the minimum date stored in `ts`. // Verify whether the tr.MinTimestamp is included into `ts` or is smaller than the minimum date stored in `ts`.
// Do not check whether tr.MaxTimestamp is included into `ts` or is bigger than the max date stored in `ts` for performance reasons. // Do not check whether tr.MaxTimestamp is included into `ts` or is bigger than the max date stored in `ts` for performance reasons.
// This means that containsTimeRange() can return true if `tr` is located below the min date stored in `ts`. // This means that containsTimeRangeSlow() can return true if `tr` is located below the min date stored in `ts`.
// This is OK, since this case isn't encountered too much in practice. // This is OK, since this case isn't encountered too much in practice.
// The main practical case allows skipping searching in prev indexdb (`ts`) when `tr` // The main practical case allows skipping searching in prev indexdb (`ts`) when `tr`
// is located above the max date stored there. // is located above the max date stored there.
@ -1921,15 +1961,15 @@ func (is *indexSearch) containsTimeRange(tr TimeRange) (bool, error) {
ts.Seek(kb.B) ts.Seek(kb.B)
if !ts.NextItem() { if !ts.NextItem() {
if err := ts.Error(); err != nil { if err := ts.Error(); err != nil {
return false, fmt.Errorf("error when searching for minDate=%d, prefix %q: %w", minDate, kb.B, err) logger.Panicf("FATAL: error when searching for minDate=%d, prefix %q: %w", minDate, kb.B, err)
} }
return false, nil return false
} }
if !bytes.HasPrefix(ts.Item, prefix) { if !bytes.HasPrefix(ts.Item, prefix) {
// minDate exceeds max date from ts. // minDate exceeds max date from ts.
return false, nil return false
} }
return true, nil return true
} }
func (is *indexSearch) getTSIDByMetricID(dst *TSID, metricID uint64) bool { func (is *indexSearch) getTSIDByMetricID(dst *TSID, metricID uint64) bool {
@ -2225,11 +2265,7 @@ func (is *indexSearch) searchMetricIDsInternal(qt *querytracer.Tracer, tfss []*T
metricIDs := &uint64set.Set{} metricIDs := &uint64set.Set{}
ok, err := is.containsTimeRange(tr) if !is.containsTimeRange(tr) {
if err != nil {
return nil, err
}
if !ok {
qt.Printf("indexdb doesn't contain data for the given timeRange=%s", &tr) qt.Printf("indexdb doesn't contain data for the given timeRange=%s", &tr)
return metricIDs, nil return metricIDs, nil
} }