From d8f8822fa5be0060551b9e644165066b2797ce16 Mon Sep 17 00:00:00 2001
From: Nikolay
Date: Mon, 16 Sep 2024 10:05:08 +0200
Subject: [PATCH] lib/storage: consistently check for missing metricID index records (#6967)

* Previously, only missing metricID->metricName index records were tracked
  with a deadline, but missing metricID->TSID index records were possible as
  well. The indexDB metrics fix exposed a misleading metric for such missing
  records.

* This commit adds a check for missing metricID->TSID index records and
  deletes the missing metricID entry once it hits the 60-second deadline.

Related issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6931

Signed-off-by: f41gh7
---
 docs/changelog/CHANGELOG.md |  1 +
 lib/storage/index_db.go     | 47 +++++++++++--------------------------
 lib/storage/storage.go      | 45 +++++++++++++++++++++++++++++++++--
 3 files changed, 58 insertions(+), 35 deletions(-)

diff --git a/docs/changelog/CHANGELOG.md b/docs/changelog/CHANGELOG.md
index 6e977856b..54b1769cc 100644
--- a/docs/changelog/CHANGELOG.md
+++ b/docs/changelog/CHANGELOG.md
@@ -35,6 +35,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
 * BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): Fixes start-up crash on Windows OS. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6973) for details.
 * BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): fix metric `vm_object_references{type="indexdb"}`. Previously, it was overcounted.
 * BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly ingest stale NaN samples. Previously it could be dropped if series didn't exist at storage node. See this issue [https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5069] for details.
+* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly track `vm_missing_tsids_for_metric_id_total` metric. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6931) for details.
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert): do not send notifications without labels to Alertmanager. Such notifications are rejected by Alertmanager anyway. Before, vmalert could send alert notifications even if no label-value pairs left after applying `alert_relabel_configs` from [notifier config](https://docs.victoriametrics.com/vmalert/#notifier-configuration-file).
diff --git a/lib/storage/index_db.go b/lib/storage/index_db.go
index d9a1939db..2c37c2fd3 100644
--- a/lib/storage/index_db.go
+++ b/lib/storage/index_db.go
@@ -1518,36 +1518,7 @@ func (db *indexDB) searchMetricNameWithCache(dst []byte, metricID uint64) ([]byt
 		return dst, true
 	}
 
-	// Cannot find the MetricName for the given metricID.
-	// There are the following expected cases when this may happen:
-	//
-	// 1. The corresponding metricID -> metricName entry isn't visible for search yet.
-	//    The solution is to wait for some time and try the search again.
-	//    It is OK if newly registered time series isn't visible for search during some time.
-	//    This should resolve https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959
-	//
-	// 2. The metricID -> metricName entry doesn't exist in the indexdb.
-	//    This is possible after unclean shutdown or after restoring of indexdb from a snapshot.
-	//    In this case the metricID must be deleted, so new metricID is registered
-	//    again when new sample for the given metricName is ingested next time.
-	//
-	ct := fasttime.UnixTimestamp()
-	db.s.missingMetricIDsLock.Lock()
-	if ct > db.s.missingMetricIDsResetDeadline {
-		db.s.missingMetricIDs = nil
-		db.s.missingMetricIDsResetDeadline = ct + 2*60
-	}
-	deleteDeadline, ok := db.s.missingMetricIDs[metricID]
-	if !ok {
-		if db.s.missingMetricIDs == nil {
-			db.s.missingMetricIDs = make(map[uint64]uint64)
-		}
-		deleteDeadline = ct + 60
-		db.s.missingMetricIDs[metricID] = deleteDeadline
-	}
-	db.s.missingMetricIDsLock.Unlock()
-
-	if ct > deleteDeadline {
+	if db.s.shouldDeleteMissingMetricID(metricID) {
 		// Cannot find the MetricName for the given metricID for the last 60 seconds.
 		// It is likely the indexDB contains incomplete set of metricID -> metricName entries
 		// after unclean shutdown or after restoring from a snapshot.
@@ -1810,6 +1781,7 @@ func (db *indexDB) getTSIDsFromMetricIDs(qt *querytracer.Tracer, metricIDs []uin
 	tsidsFound := i
 	qt.Printf("found %d tsids for %d metricIDs in the current indexdb", tsidsFound, len(metricIDs))
 
+	var metricIDsToDelete []uint64
 	if len(extMetricIDs) > 0 {
 		// Search for extMetricIDs in the previous indexdb (aka extDB)
 		db.doExtDB(func(extDB *indexDB) {
@@ -1829,7 +1801,10 @@ func (db *indexDB) getTSIDsFromMetricIDs(qt *querytracer.Tracer, metricIDs []uin
 					// This may be the case on incomplete indexDB
 					// due to snapshot or due to unflushed entries.
 					// Just increment errors counter and skip it for now.
-					is.db.missingTSIDsForMetricID.Add(1)
+					if is.db.s.shouldDeleteMissingMetricID(metricID) {
+						is.db.missingTSIDsForMetricID.Add(1)
+						metricIDsToDelete = append(metricIDsToDelete, metricID)
+					}
 					continue
 				}
 				is.db.putToMetricIDCache(metricID, tsid)
@@ -1845,6 +1820,10 @@ func (db *indexDB) getTSIDsFromMetricIDs(qt *querytracer.Tracer, metricIDs []uin
 	tsids = tsids[:i]
 	qt.Printf("load %d tsids for %d metricIDs from both current and previous indexdb", len(tsids), len(metricIDs))
 
+	if len(metricIDsToDelete) > 0 {
+		db.deleteMetricIDs(metricIDsToDelete)
+	}
+
 	// Sort the found tsids, since they must be passed to TSID search
 	// in the sorted order.
 	sort.Slice(tsids, func(i, j int) bool { return tsids[i].Less(&tsids[j]) })
@@ -3313,8 +3292,10 @@ func mergeTagToMetricIDsRowsInternal(data []byte, items []mergeset.Item, nsPrefi
 	return dstData, dstItems
 }
 
-var indexBlocksWithMetricIDsIncorrectOrder atomic.Uint64
-var indexBlocksWithMetricIDsProcessed atomic.Uint64
+var (
+	indexBlocksWithMetricIDsIncorrectOrder atomic.Uint64
+	indexBlocksWithMetricIDsProcessed      atomic.Uint64
+)
 
 func checkItemsSorted(data []byte, items []mergeset.Item) bool {
 	if len(items) == 0 {
diff --git a/lib/storage/storage.go b/lib/storage/storage.go
index 2125ca247..71b91b1a5 100644
--- a/lib/storage/storage.go
+++ b/lib/storage/storage.go
@@ -154,8 +154,9 @@ type Storage struct {
 	// missingMetricIDs maps metricID to the deadline in unix timestamp seconds
 	// after which all the indexdb entries for the given metricID
-	// must be deleted if metricName isn't found by the given metricID.
-	// This is used inside searchMetricNameWithCache() for detecting permanently missing metricID->metricName entries.
+	// must be deleted if the index entry isn't found for the given metricID.
+	// This is used inside searchMetricNameWithCache() and getTSIDsFromMetricIDs()
+	// for detecting permanently missing metricID->metricName/TSID entries.
 	// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959
 	missingMetricIDsLock          sync.Mutex
 	missingMetricIDs              map[uint64]uint64
@@ -2709,3 +2710,43 @@ var indexDBTableIdx = func() *atomic.Uint64 {
 	x.Store(uint64(time.Now().UnixNano()))
 	return &x
 }()
+
+// shouldDeleteMissingMetricID reports whether the index entry for the given metricID
+// has been missing for long enough that the caller should delete the broken entry.
+//
+// There are the following expected cases when an index entry may be missing:
+//
+// 1. The corresponding metricID -> metricName/tsid entry isn't visible for search yet.
+//    The solution is to wait for some time and try the search again.
+//    It is OK if newly registered time series isn't visible for search during some time.
+//    This should resolve https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959
+//
+// 2. The metricID -> metricName/tsid entry doesn't exist in the indexdb.
+//    This is possible after unclean shutdown or after restoring indexdb from a snapshot.
+//    In this case the metricID must be deleted, so a new metricID is registered
+//    again when a new sample for the given metric is ingested next time.
+func (s *Storage) shouldDeleteMissingMetricID(metricID uint64) bool {
+	ct := fasttime.UnixTimestamp()
+	s.missingMetricIDsLock.Lock()
+	defer s.missingMetricIDsLock.Unlock()
+
+	if ct > s.missingMetricIDsResetDeadline {
+		s.missingMetricIDs = nil
+		s.missingMetricIDsResetDeadline = ct + 2*60
+	}
+	deleteDeadline, ok := s.missingMetricIDs[metricID]
+	if !ok {
+		if s.missingMetricIDs == nil {
+			s.missingMetricIDs = make(map[uint64]uint64)
+		}
+		deleteDeadline = ct + 60
+		s.missingMetricIDs[metricID] = deleteDeadline
+	}
+	// If the index entry for the given metricID is still missing after the 60-second deadline,
+	// it is likely the indexDB contains an incomplete set of metricID -> metricName/tsid entries
+	// after unclean shutdown or after restoring from a snapshot.
+	// The caller should then mark the metricID as deleted, so it is created again
+	// when a new sample for the given time series is ingested next time.
+
+	return ct > deleteDeadline
+}
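
Note (not part of the patch): below is a minimal, self-contained Go sketch of the deadline tracking that shouldDeleteMissingMetricID() implements. The first time a metricID is reported as missing, a 60-second delete deadline is registered for it; only when the same metricID is still reported as missing after that deadline does the check return true, telling the caller to delete the stale index entries. The whole map is dropped roughly every two minutes so transient gaps don't accumulate. The missingTracker type and shouldDelete method are illustrative names and do not exist in the VictoriaMetrics codebase; the current time is passed in explicitly to keep the sketch deterministic.

package main

import (
	"fmt"
	"sync"
	"time"
)

// missingTracker mimics the Storage.missingMetricIDs bookkeeping added by the
// patch. Illustrative only; not the actual VictoriaMetrics type.
type missingTracker struct {
	mu            sync.Mutex
	missing       map[uint64]uint64 // metricID -> delete deadline (unix seconds)
	resetDeadline uint64            // when to drop the whole map
}

// shouldDelete reports whether the given metricID has been missing for more
// than 60 seconds, following the same steps as shouldDeleteMissingMetricID().
func (t *missingTracker) shouldDelete(metricID, nowUnix uint64) bool {
	t.mu.Lock()
	defer t.mu.Unlock()

	// Periodically drop the map so that metricIDs which became searchable
	// in the meantime are forgotten.
	if nowUnix > t.resetDeadline {
		t.missing = nil
		t.resetDeadline = nowUnix + 2*60
	}
	deadline, ok := t.missing[metricID]
	if !ok {
		// First sighting of this missing metricID: register a 60s deadline.
		if t.missing == nil {
			t.missing = make(map[uint64]uint64)
		}
		deadline = nowUnix + 60
		t.missing[metricID] = deadline
	}
	return nowUnix > deadline
}

func main() {
	var t missingTracker
	now := uint64(time.Now().Unix())

	fmt.Println(t.shouldDelete(42, now))    // false: first sighting, deadline registered
	fmt.Println(t.shouldDelete(42, now+30)) // false: still within the 60-second grace period
	fmt.Println(t.shouldDelete(42, now+61)) // true: missing for more than 60s, entries may be deleted
}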