mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
lib/storage: consistently check for missing metricID index records (#6967)
* Previously, only missing metricID->metricName index records were tracked with a deadline. However, metricID->TSID index records could be missing as well, and the IndexDB metrics fix exposed a misleading metric for such missing records. * This commit adds a check for missing metricID->TSID index records and deletes the missing metricID entry once it hits the 60-second deadline. Related issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6931 Signed-off-by: f41gh7 <nik@victoriametrics.com>
This commit is contained in:
parent
264c2ec6bd
commit
d8f8822fa5
3 changed files with 58 additions and 35 deletions
|
@ -35,6 +35,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
|
|||
* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): Fixes start-up crash on Windows OS. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6973) for details.
|
||||
* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): fix metric `vm_object_references{type="indexdb"}`. Previously, it was overcounted.
|
||||
* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly ingest stale NaN samples. Previously, they could be dropped if the series didn't exist at the storage node. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5069) for details.
|
||||
* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly track `vm_missing_tsids_for_metric_id_total` metric. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6931) for details.
|
||||
|
||||
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert): do not send notifications without labels to Alertmanager. Such notifications are rejected by Alertmanager anyway. Before, vmalert could send alert notifications even if no label-value pairs left after applying `alert_relabel_configs` from [notifier config](https://docs.victoriametrics.com/vmalert/#notifier-configuration-file).
|
||||
|
||||
|
|
|
@ -1518,36 +1518,7 @@ func (db *indexDB) searchMetricNameWithCache(dst []byte, metricID uint64) ([]byt
|
|||
return dst, true
|
||||
}
|
||||
|
||||
// Cannot find the MetricName for the given metricID.
|
||||
// There are the following expected cases when this may happen:
|
||||
//
|
||||
// 1. The corresponding metricID -> metricName entry isn't visible for search yet.
|
||||
// The solution is to wait for some time and try the search again.
|
||||
// It is OK if newly registered time series isn't visible for search during some time.
|
||||
// This should resolve https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959
|
||||
//
|
||||
// 2. The metricID -> metricName entry doesn't exist in the indexdb.
|
||||
// This is possible after unclean shutdown or after restoring of indexdb from a snapshot.
|
||||
// In this case the metricID must be deleted, so new metricID is registered
|
||||
// again when new sample for the given metricName is ingested next time.
|
||||
//
|
||||
ct := fasttime.UnixTimestamp()
|
||||
db.s.missingMetricIDsLock.Lock()
|
||||
if ct > db.s.missingMetricIDsResetDeadline {
|
||||
db.s.missingMetricIDs = nil
|
||||
db.s.missingMetricIDsResetDeadline = ct + 2*60
|
||||
}
|
||||
deleteDeadline, ok := db.s.missingMetricIDs[metricID]
|
||||
if !ok {
|
||||
if db.s.missingMetricIDs == nil {
|
||||
db.s.missingMetricIDs = make(map[uint64]uint64)
|
||||
}
|
||||
deleteDeadline = ct + 60
|
||||
db.s.missingMetricIDs[metricID] = deleteDeadline
|
||||
}
|
||||
db.s.missingMetricIDsLock.Unlock()
|
||||
|
||||
if ct > deleteDeadline {
|
||||
if db.s.shouldDeleteMissingMetricID(metricID) {
|
||||
// Cannot find the MetricName for the given metricID for the last 60 seconds.
|
||||
// It is likely the indexDB contains incomplete set of metricID -> metricName entries
|
||||
// after unclean shutdown or after restoring from a snapshot.
|
||||
|
@ -1810,6 +1781,7 @@ func (db *indexDB) getTSIDsFromMetricIDs(qt *querytracer.Tracer, metricIDs []uin
|
|||
tsidsFound := i
|
||||
qt.Printf("found %d tsids for %d metricIDs in the current indexdb", tsidsFound, len(metricIDs))
|
||||
|
||||
var metricIDsToDelete []uint64
|
||||
if len(extMetricIDs) > 0 {
|
||||
// Search for extMetricIDs in the previous indexdb (aka extDB)
|
||||
db.doExtDB(func(extDB *indexDB) {
|
||||
|
@ -1829,7 +1801,10 @@ func (db *indexDB) getTSIDsFromMetricIDs(qt *querytracer.Tracer, metricIDs []uin
|
|||
// This may be the case on incomplete indexDB
|
||||
// due to snapshot or due to unflushed entries.
|
||||
// Just increment errors counter and skip it for now.
|
||||
if is.db.s.shouldDeleteMissingMetricID(metricID) {
|
||||
is.db.missingTSIDsForMetricID.Add(1)
|
||||
metricIDsToDelete = append(metricIDsToDelete, metricID)
|
||||
}
|
||||
continue
|
||||
}
|
||||
is.db.putToMetricIDCache(metricID, tsid)
|
||||
|
@ -1845,6 +1820,10 @@ func (db *indexDB) getTSIDsFromMetricIDs(qt *querytracer.Tracer, metricIDs []uin
|
|||
tsids = tsids[:i]
|
||||
qt.Printf("load %d tsids for %d metricIDs from both current and previous indexdb", len(tsids), len(metricIDs))
|
||||
|
||||
if len(metricIDsToDelete) > 0 {
|
||||
db.deleteMetricIDs(metricIDsToDelete)
|
||||
}
|
||||
|
||||
// Sort the found tsids, since they must be passed to TSID search
|
||||
// in the sorted order.
|
||||
sort.Slice(tsids, func(i, j int) bool { return tsids[i].Less(&tsids[j]) })
|
||||
|
@ -3313,8 +3292,10 @@ func mergeTagToMetricIDsRowsInternal(data []byte, items []mergeset.Item, nsPrefi
|
|||
return dstData, dstItems
|
||||
}
|
||||
|
||||
var indexBlocksWithMetricIDsIncorrectOrder atomic.Uint64
|
||||
var indexBlocksWithMetricIDsProcessed atomic.Uint64
|
||||
var (
|
||||
indexBlocksWithMetricIDsIncorrectOrder atomic.Uint64
|
||||
indexBlocksWithMetricIDsProcessed atomic.Uint64
|
||||
)
|
||||
|
||||
func checkItemsSorted(data []byte, items []mergeset.Item) bool {
|
||||
if len(items) == 0 {
|
||||
|
|
|
@ -154,8 +154,9 @@ type Storage struct {
|
|||
|
||||
// missingMetricIDs maps metricID to the deadline in unix timestamp seconds
|
||||
// after which all the indexdb entries for the given metricID
|
||||
// must be deleted if metricName isn't found by the given metricID.
|
||||
// This is used inside searchMetricNameWithCache() for detecting permanently missing metricID->metricName entries.
|
||||
// must be deleted if index entry isn't found by the given metricID.
|
||||
// This is used inside searchMetricNameWithCache() and getTSIDsFromMetricIDs()
|
||||
// for detecting permanently missing metricID->metricName/TSID entries.
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959
|
||||
missingMetricIDsLock sync.Mutex
|
||||
missingMetricIDs map[uint64]uint64
|
||||
|
@ -2709,3 +2710,43 @@ var indexDBTableIdx = func() *atomic.Uint64 {
|
|||
x.Store(uint64(time.Now().UnixNano()))
|
||||
return &x
|
||||
}()
|
||||
|
||||
// shouldDeleteMissingMetricID checks if metricID index entry is missing
|
||||
//
|
||||
// Broken index entry should be deleted by caller
|
||||
// There are the following expected cases when this may happen:
|
||||
//
|
||||
// 1. The corresponding metricID -> metricName/tsid entry isn't visible for search yet.
|
||||
// The solution is to wait for some time and try the search again.
|
||||
// It is OK if newly registered time series isn't visible for search during some time.
|
||||
// This should resolve https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959
|
||||
//
|
||||
// 2. The metricID -> metricName/tsid entry doesn't exist in the indexdb.
|
||||
// This is possible after unclean shutdown or after restoring of indexdb from a snapshot.
|
||||
// In this case the metricID must be deleted, so new metricID is registered
|
||||
// again when new sample for the given metric is ingested next time.
|
||||
func (s *Storage) shouldDeleteMissingMetricID(metricID uint64) bool {
|
||||
ct := fasttime.UnixTimestamp()
|
||||
s.missingMetricIDsLock.Lock()
|
||||
defer s.missingMetricIDsLock.Unlock()
|
||||
|
||||
if ct > s.missingMetricIDsResetDeadline {
|
||||
s.missingMetricIDs = nil
|
||||
s.missingMetricIDsResetDeadline = ct + 2*60
|
||||
}
|
||||
deleteDeadline, ok := s.missingMetricIDs[metricID]
|
||||
if !ok {
|
||||
if s.missingMetricIDs == nil {
|
||||
s.missingMetricIDs = make(map[uint64]uint64)
|
||||
}
|
||||
deleteDeadline = ct + 60
|
||||
s.missingMetricIDs[metricID] = deleteDeadline
|
||||
}
|
||||
// Cannot find index entry for the given metricID for the last 60 seconds.
|
||||
// It is likely the indexDB contains incomplete set of metricID -> metricName/tsid entries
|
||||
// after unclean shutdown or after restoring from a snapshot.
|
||||
// Mark the metricID as deleted, so it is created again when new sample
|
||||
// for the given time series is ingested next time.
|
||||
|
||||
return ct > deleteDeadline
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue