lib/storage: consistently check for missing metricID index records (#6967)

* Previously, only missing metricID->metricName index records were
tracked with a deadline. However, metricID->TSID index records could
also be missing, and the IndexDB metrics fix exposed a misleading
metric for such missing records.

* This commit adds a check for missing metricID->TSID index records and
deletes the missing metricID entry once it hits the 60-second deadline.

Related issue
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6931

Signed-off-by: f41gh7 <nik@victoriametrics.com>
This commit is contained in:
Nikolay 2024-09-16 10:05:08 +02:00 committed by GitHub
parent 264c2ec6bd
commit d8f8822fa5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 58 additions and 35 deletions

View file

@ -35,6 +35,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): Fixes start-up crash on Windows OS. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6973) for details.
* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): fix metric `vm_object_references{type="indexdb"}`. Previously, it was overcounted.
* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly ingest stale NaN samples. Previously, they could be dropped if the series didn't exist at the storage node. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5069) for details.
* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly track `vm_missing_tsids_for_metric_id_total` metric. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6931) for details.
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert): do not send notifications without labels to Alertmanager. Such notifications are rejected by Alertmanager anyway. Before, vmalert could send alert notifications even if no label-value pairs left after applying `alert_relabel_configs` from [notifier config](https://docs.victoriametrics.com/vmalert/#notifier-configuration-file).

View file

@ -1518,36 +1518,7 @@ func (db *indexDB) searchMetricNameWithCache(dst []byte, metricID uint64) ([]byt
return dst, true
}
// Cannot find the MetricName for the given metricID.
// There are the following expected cases when this may happen:
//
// 1. The corresponding metricID -> metricName entry isn't visible for search yet.
// The solution is to wait for some time and try the search again.
// It is OK if newly registered time series isn't visible for search during some time.
// This should resolve https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959
//
// 2. The metricID -> metricName entry doesn't exist in the indexdb.
// This is possible after unclean shutdown or after restoring of indexdb from a snapshot.
// In this case the metricID must be deleted, so new metricID is registered
// again when new sample for the given metricName is ingested next time.
//
ct := fasttime.UnixTimestamp()
db.s.missingMetricIDsLock.Lock()
if ct > db.s.missingMetricIDsResetDeadline {
db.s.missingMetricIDs = nil
db.s.missingMetricIDsResetDeadline = ct + 2*60
}
deleteDeadline, ok := db.s.missingMetricIDs[metricID]
if !ok {
if db.s.missingMetricIDs == nil {
db.s.missingMetricIDs = make(map[uint64]uint64)
}
deleteDeadline = ct + 60
db.s.missingMetricIDs[metricID] = deleteDeadline
}
db.s.missingMetricIDsLock.Unlock()
if ct > deleteDeadline {
if db.s.shouldDeleteMissingMetricID(metricID) {
// Cannot find the MetricName for the given metricID for the last 60 seconds.
// It is likely the indexDB contains incomplete set of metricID -> metricName entries
// after unclean shutdown or after restoring from a snapshot.
@ -1810,6 +1781,7 @@ func (db *indexDB) getTSIDsFromMetricIDs(qt *querytracer.Tracer, metricIDs []uin
tsidsFound := i
qt.Printf("found %d tsids for %d metricIDs in the current indexdb", tsidsFound, len(metricIDs))
var metricIDsToDelete []uint64
if len(extMetricIDs) > 0 {
// Search for extMetricIDs in the previous indexdb (aka extDB)
db.doExtDB(func(extDB *indexDB) {
@ -1829,7 +1801,10 @@ func (db *indexDB) getTSIDsFromMetricIDs(qt *querytracer.Tracer, metricIDs []uin
// This may be the case on incomplete indexDB
// due to snapshot or due to unflushed entries.
// Just increment errors counter and skip it for now.
is.db.missingTSIDsForMetricID.Add(1)
if is.db.s.shouldDeleteMissingMetricID(metricID) {
is.db.missingTSIDsForMetricID.Add(1)
metricIDsToDelete = append(metricIDsToDelete, metricID)
}
continue
}
is.db.putToMetricIDCache(metricID, tsid)
@ -1845,6 +1820,10 @@ func (db *indexDB) getTSIDsFromMetricIDs(qt *querytracer.Tracer, metricIDs []uin
tsids = tsids[:i]
qt.Printf("load %d tsids for %d metricIDs from both current and previous indexdb", len(tsids), len(metricIDs))
if len(metricIDsToDelete) > 0 {
db.deleteMetricIDs(metricIDsToDelete)
}
// Sort the found tsids, since they must be passed to TSID search
// in the sorted order.
sort.Slice(tsids, func(i, j int) bool { return tsids[i].Less(&tsids[j]) })
@ -3313,8 +3292,10 @@ func mergeTagToMetricIDsRowsInternal(data []byte, items []mergeset.Item, nsPrefi
return dstData, dstItems
}
var indexBlocksWithMetricIDsIncorrectOrder atomic.Uint64
var indexBlocksWithMetricIDsProcessed atomic.Uint64
var (
indexBlocksWithMetricIDsIncorrectOrder atomic.Uint64
indexBlocksWithMetricIDsProcessed atomic.Uint64
)
func checkItemsSorted(data []byte, items []mergeset.Item) bool {
if len(items) == 0 {

View file

@ -154,8 +154,9 @@ type Storage struct {
// missingMetricIDs maps metricID to the deadline in unix timestamp seconds
// after which all the indexdb entries for the given metricID
// must be deleted if metricName isn't found by the given metricID.
// This is used inside searchMetricNameWithCache() for detecting permanently missing metricID->metricName entries.
// must be deleted if index entry isn't found by the given metricID.
// This is used inside searchMetricNameWithCache() and getTSIDsFromMetricIDs()
// for detecting permanently missing metricID->metricName/TSID entries.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959
missingMetricIDsLock sync.Mutex
missingMetricIDs map[uint64]uint64
@ -2709,3 +2710,43 @@ var indexDBTableIdx = func() *atomic.Uint64 {
x.Store(uint64(time.Now().UnixNano()))
return &x
}()
// shouldDeleteMissingMetricID reports whether the index entry for the given
// metricID has been missing for long enough that the caller should delete
// the metricID.
//
// The caller invokes this function when an index lookup by metricID fails.
// Such a failure is expected in the following cases:
//
//  1. The metricID -> metricName/tsid entry isn't visible for search yet.
//     It is OK if a newly registered time series isn't visible for search
//     during some time, so the caller should just retry the search later.
//     See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959
//
//  2. The metricID -> metricName/tsid entry doesn't exist in the indexdb.
//     This is possible after unclean shutdown or after restoring of indexdb
//     from a snapshot. In this case the metricID must be deleted, so a new
//     metricID is registered when a new sample for the given metric
//     is ingested next time.
//
// In order to distinguish between these cases, the first failed lookup for
// a metricID records a deadline 60 seconds in the future and false is returned.
// True is returned only when the function is called for the same metricID
// after the recorded deadline has passed.
//
// All the recorded deadlines are dropped once the reset deadline passes;
// the next reset deadline is set 2 minutes ahead at that moment.
func (s *Storage) shouldDeleteMissingMetricID(metricID uint64) bool {
	const (
		// deleteDelaySecs is how long the entry must stay missing before it is reported for deletion.
		deleteDelaySecs = 60
		// resetIntervalSecs is how often the whole set of tracked metricIDs is dropped.
		resetIntervalSecs = 2 * 60
	)

	now := fasttime.UnixTimestamp()

	s.missingMetricIDsLock.Lock()
	defer s.missingMetricIDsLock.Unlock()

	// Periodically forget all the tracked metricIDs.
	if now > s.missingMetricIDsResetDeadline {
		s.missingMetricIDs = nil
		s.missingMetricIDsResetDeadline = now + resetIntervalSecs
	}

	// A lookup in a nil map always misses, so the map is going to be needed
	// for registering the metricID below. Allocate it up front.
	if s.missingMetricIDs == nil {
		s.missingMetricIDs = make(map[uint64]uint64)
	}

	deadline, tracked := s.missingMetricIDs[metricID]
	if !tracked {
		// The first miss for this metricID - start the grace period.
		deadline = now + deleteDelaySecs
		s.missingMetricIDs[metricID] = deadline
	}

	// The index entry is considered permanently missing only after
	// the grace period for the given metricID is over.
	return now > deadline
}