From d8f8822fa5be0060551b9e644165066b2797ce16 Mon Sep 17 00:00:00 2001
From: Nikolay
Date: Mon, 16 Sep 2024 10:05:08 +0200
Subject: [PATCH] lib/storage: consistently check for missing metricID index records (#6967)

* Previously, only missing metricID->metricName index records were tracked
  with a deadline, but missing metricID->TSID index records were possible as
  well. The indexDB metrics fix exposed a misleading metric for such missing
  records.

* This commit adds a check for missing metricID->TSID index records and
  deletes the missing metricID entry once it hits the 60-second deadline.

Related issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6931

Signed-off-by: f41gh7
---
 docs/changelog/CHANGELOG.md |  1 +
 lib/storage/index_db.go     | 47 +++++++++++--------------------------
 lib/storage/storage.go      | 45 +++++++++++++++++++++++++++++++++--
 3 files changed, 58 insertions(+), 35 deletions(-)

diff --git a/docs/changelog/CHANGELOG.md b/docs/changelog/CHANGELOG.md
index 6e977856b..54b1769cc 100644
--- a/docs/changelog/CHANGELOG.md
+++ b/docs/changelog/CHANGELOG.md
@@ -35,6 +35,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
 * BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): Fixes start-up crash on Windows OS. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6973) for details.
 * BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): fix metric `vm_object_references{type="indexdb"}`. Previously, it was overcounted.
 * BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly ingest stale NaN samples. Previously it could be dropped if series didn't exist at storage node. See this issue [https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5069] for details.
+* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly track `vm_missing_tsids_for_metric_id_total` metric. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6931) for details.
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert): do not send notifications without labels to Alertmanager. Such notifications are rejected by Alertmanager anyway. Before, vmalert could send alert notifications even if no label-value pairs left after applying `alert_relabel_configs` from [notifier config](https://docs.victoriametrics.com/vmalert/#notifier-configuration-file).
diff --git a/lib/storage/index_db.go b/lib/storage/index_db.go
index d9a1939db..2c37c2fd3 100644
--- a/lib/storage/index_db.go
+++ b/lib/storage/index_db.go
@@ -1518,36 +1518,7 @@ func (db *indexDB) searchMetricNameWithCache(dst []byte, metricID uint64) ([]byt
 		return dst, true
 	}
 
-	// Cannot find the MetricName for the given metricID.
-	// There are the following expected cases when this may happen:
-	//
-	// 1. The corresponding metricID -> metricName entry isn't visible for search yet.
-	//    The solution is to wait for some time and try the search again.
-	//    It is OK if newly registered time series isn't visible for search during some time.
-	//    This should resolve https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959
-	//
-	// 2. The metricID -> metricName entry doesn't exist in the indexdb.
-	//    This is possible after unclean shutdown or after restoring of indexdb from a snapshot.
-	//    In this case the metricID must be deleted, so new metricID is registered
-	//    again when new sample for the given metricName is ingested next time.
-	//
-	ct := fasttime.UnixTimestamp()
-	db.s.missingMetricIDsLock.Lock()
-	if ct > db.s.missingMetricIDsResetDeadline {
-		db.s.missingMetricIDs = nil
-		db.s.missingMetricIDsResetDeadline = ct + 2*60
-	}
-	deleteDeadline, ok := db.s.missingMetricIDs[metricID]
-	if !ok {
-		if db.s.missingMetricIDs == nil {
-			db.s.missingMetricIDs = make(map[uint64]uint64)
-		}
-		deleteDeadline = ct + 60
-		db.s.missingMetricIDs[metricID] = deleteDeadline
-	}
-	db.s.missingMetricIDsLock.Unlock()
-
-	if ct > deleteDeadline {
+	if db.s.shouldDeleteMissingMetricID(metricID) {
 		// Cannot find the MetricName for the given metricID for the last 60 seconds.
 		// It is likely the indexDB contains incomplete set of metricID -> metricName entries
 		// after unclean shutdown or after restoring from a snapshot.
@@ -1810,6 +1781,7 @@ func (db *indexDB) getTSIDsFromMetricIDs(qt *querytracer.Tracer, metricIDs []uin
 	tsidsFound := i
 	qt.Printf("found %d tsids for %d metricIDs in the current indexdb", tsidsFound, len(metricIDs))
 
+	var metricIDsToDelete []uint64
 	if len(extMetricIDs) > 0 {
 		// Search for extMetricIDs in the previous indexdb (aka extDB)
 		db.doExtDB(func(extDB *indexDB) {
@@ -1829,7 +1801,10 @@ func (db *indexDB) getTSIDsFromMetricIDs(qt *querytracer.Tracer, metricIDs []uin
 					// This may be the case on incomplete indexDB
 					// due to snapshot or due to unflushed entries.
 					// Just increment errors counter and skip it for now.
-					is.db.missingTSIDsForMetricID.Add(1)
+					if is.db.s.shouldDeleteMissingMetricID(metricID) {
+						is.db.missingTSIDsForMetricID.Add(1)
+						metricIDsToDelete = append(metricIDsToDelete, metricID)
+					}
 					continue
 				}
 				is.db.putToMetricIDCache(metricID, tsid)
@@ -1845,6 +1820,10 @@ func (db *indexDB) getTSIDsFromMetricIDs(qt *querytracer.Tracer, metricIDs []uin
 	tsids = tsids[:i]
 	qt.Printf("load %d tsids for %d metricIDs from both current and previous indexdb", len(tsids), len(metricIDs))
 
+	if len(metricIDsToDelete) > 0 {
+		db.deleteMetricIDs(metricIDsToDelete)
+	}
+
 	// Sort the found tsids, since they must be passed to TSID search
 	// in the sorted order.
 	sort.Slice(tsids, func(i, j int) bool { return tsids[i].Less(&tsids[j]) })
@@ -3313,8 +3292,10 @@ func mergeTagToMetricIDsRowsInternal(data []byte, items []mergeset.Item, nsPrefi
 	return dstData, dstItems
 }
 
-var indexBlocksWithMetricIDsIncorrectOrder atomic.Uint64
-var indexBlocksWithMetricIDsProcessed atomic.Uint64
+var (
+	indexBlocksWithMetricIDsIncorrectOrder atomic.Uint64
+	indexBlocksWithMetricIDsProcessed      atomic.Uint64
+)
 
 func checkItemsSorted(data []byte, items []mergeset.Item) bool {
 	if len(items) == 0 {
diff --git a/lib/storage/storage.go b/lib/storage/storage.go
index 2125ca247..71b91b1a5 100644
--- a/lib/storage/storage.go
+++ b/lib/storage/storage.go
@@ -154,8 +154,9 @@ type Storage struct {
 	// missingMetricIDs maps metricID to the deadline in unix timestamp seconds
 	// after which all the indexdb entries for the given metricID
-	// must be deleted if metricName isn't found by the given metricID.
-	// This is used inside searchMetricNameWithCache() for detecting permanently missing metricID->metricName entries.
+	// must be deleted if the index entry isn't found for the given metricID.
+	// This is used inside searchMetricNameWithCache() and getTSIDsFromMetricIDs()
+	// for detecting permanently missing metricID->metricName/TSID entries.
 	// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959
 	missingMetricIDsLock          sync.Mutex
 	missingMetricIDs              map[uint64]uint64
@@ -2709,3 +2710,43 @@ var indexDBTableIdx = func() *atomic.Uint64 {
 	x.Store(uint64(time.Now().UnixNano()))
 	return &x
 }()
+
+// shouldDeleteMissingMetricID reports whether the index entry for the given metricID
+// has been missing for long enough that the caller should delete the broken entry.
+//
+// There are the following expected cases when an index entry may be missing:
+//
+// 1. The corresponding metricID -> metricName/tsid entry isn't visible for search yet.
+//    The solution is to wait for some time and try the search again.
+//    It is OK if newly registered time series isn't visible for search during some time.
+//    This should resolve https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959
+//
+// 2. The metricID -> metricName/tsid entry doesn't exist in the indexdb.
+//    This is possible after unclean shutdown or after restoring indexdb from a snapshot.
+//    In this case the metricID must be deleted, so a new metricID is registered
+//    again when a new sample for the given metric is ingested next time.
+func (s *Storage) shouldDeleteMissingMetricID(metricID uint64) bool {
+	ct := fasttime.UnixTimestamp()
+	s.missingMetricIDsLock.Lock()
+	defer s.missingMetricIDsLock.Unlock()
+
+	if ct > s.missingMetricIDsResetDeadline {
+		s.missingMetricIDs = nil
+		s.missingMetricIDsResetDeadline = ct + 2*60
+	}
+	deleteDeadline, ok := s.missingMetricIDs[metricID]
+	if !ok {
+		if s.missingMetricIDs == nil {
+			s.missingMetricIDs = make(map[uint64]uint64)
+		}
+		deleteDeadline = ct + 60
+		s.missingMetricIDs[metricID] = deleteDeadline
+	}
+	// If the index entry for the given metricID is still missing after the 60-second deadline,
+	// it is likely the indexDB contains an incomplete set of metricID -> metricName/tsid entries
+	// after unclean shutdown or after restoring from a snapshot.
+	// The caller should then mark the metricID as deleted, so it is created again
+	// when a new sample for the given time series is ingested next time.
+
+	return ct > deleteDeadline
+}
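
Note (not part of the patch): below is a minimal, self-contained Go sketch of the deadline tracking that shouldDeleteMissingMetricID() implements. The first time a metricID is reported as missing, a 60-second delete deadline is registered for it; only when the same metricID is still reported as missing after that deadline does the check return true, telling the caller to delete the stale index entries. The whole map is dropped roughly every two minutes so transient gaps don't accumulate. The missingTracker type and shouldDelete method are illustrative names and do not exist in the VictoriaMetrics codebase; the current time is passed in explicitly to keep the sketch deterministic.

package main

import (
	"fmt"
	"sync"
	"time"
)

// missingTracker mimics the Storage.missingMetricIDs bookkeeping added by the
// patch. Illustrative only; not the actual VictoriaMetrics type.
type missingTracker struct {
	mu            sync.Mutex
	missing       map[uint64]uint64 // metricID -> delete deadline (unix seconds)
	resetDeadline uint64            // when to drop the whole map
}

// shouldDelete reports whether the given metricID has been missing for more
// than 60 seconds, following the same steps as shouldDeleteMissingMetricID().
func (t *missingTracker) shouldDelete(metricID, nowUnix uint64) bool {
	t.mu.Lock()
	defer t.mu.Unlock()

	// Periodically drop the map so that metricIDs which became searchable
	// in the meantime are forgotten.
	if nowUnix > t.resetDeadline {
		t.missing = nil
		t.resetDeadline = nowUnix + 2*60
	}
	deadline, ok := t.missing[metricID]
	if !ok {
		// First sighting of this missing metricID: register a 60s deadline.
		if t.missing == nil {
			t.missing = make(map[uint64]uint64)
		}
		deadline = nowUnix + 60
		t.missing[metricID] = deadline
	}
	return nowUnix > deadline
}

func main() {
	var t missingTracker
	now := uint64(time.Now().Unix())

	fmt.Println(t.shouldDelete(42, now))    // false: first sighting, deadline registered
	fmt.Println(t.shouldDelete(42, now+30)) // false: still within the 60-second grace period
	fmt.Println(t.shouldDelete(42, now+61)) // true: missing for more than 60s, entries may be deleted
}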