mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
lib/storage: consistently check for missing metricID index records (#6967)
* Previously, only missing metricID->metricName index records were tracked with a deadline. However, metricID->TSID index records could be missing as well, and the IndexDB metrics fix exposed a misleading metric for such missing records. * This commit adds a check for missing metricID->TSID index records and deletes the missing metricID entry once it hits the 60-second deadline. Related issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6931 Signed-off-by: f41gh7 <nik@victoriametrics.com>
This commit is contained in:
parent
264c2ec6bd
commit
d8f8822fa5
3 changed files with 58 additions and 35 deletions
|
@ -35,6 +35,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
|
||||||
* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): Fixes start-up crash on Windows OS. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6973) for details.
|
* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): Fixes start-up crash on Windows OS. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6973) for details.
|
||||||
* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): fix metric `vm_object_references{type="indexdb"}`. Previously, it was overcounted.
|
* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): fix metric `vm_object_references{type="indexdb"}`. Previously, it was overcounted.
|
||||||
* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly ingest stale NaN samples. Previously they could be dropped if the series didn't exist at the storage node. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5069) for details.
|
* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly ingest stale NaN samples. Previously they could be dropped if the series didn't exist at the storage node. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5069) for details.
|
||||||
|
* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly track `vm_missing_tsids_for_metric_id_total` metric. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6931) for details.
|
||||||
|
|
||||||
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert): do not send notifications without labels to Alertmanager. Such notifications are rejected by Alertmanager anyway. Before, vmalert could send alert notifications even if no label-value pairs left after applying `alert_relabel_configs` from [notifier config](https://docs.victoriametrics.com/vmalert/#notifier-configuration-file).
|
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert): do not send notifications without labels to Alertmanager. Such notifications are rejected by Alertmanager anyway. Before, vmalert could send alert notifications even if no label-value pairs left after applying `alert_relabel_configs` from [notifier config](https://docs.victoriametrics.com/vmalert/#notifier-configuration-file).
|
||||||
|
|
||||||
|
|
|
@ -1518,36 +1518,7 @@ func (db *indexDB) searchMetricNameWithCache(dst []byte, metricID uint64) ([]byt
|
||||||
return dst, true
|
return dst, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// Cannot find the MetricName for the given metricID.
|
if db.s.shouldDeleteMissingMetricID(metricID) {
|
||||||
// There are the following expected cases when this may happen:
|
|
||||||
//
|
|
||||||
// 1. The corresponding metricID -> metricName entry isn't visible for search yet.
|
|
||||||
// The solution is to wait for some time and try the search again.
|
|
||||||
// It is OK if newly registered time series isn't visible for search during some time.
|
|
||||||
// This should resolve https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959
|
|
||||||
//
|
|
||||||
// 2. The metricID -> metricName entry doesn't exist in the indexdb.
|
|
||||||
// This is possible after unclean shutdown or after restoring of indexdb from a snapshot.
|
|
||||||
// In this case the metricID must be deleted, so new metricID is registered
|
|
||||||
// again when new sample for the given metricName is ingested next time.
|
|
||||||
//
|
|
||||||
ct := fasttime.UnixTimestamp()
|
|
||||||
db.s.missingMetricIDsLock.Lock()
|
|
||||||
if ct > db.s.missingMetricIDsResetDeadline {
|
|
||||||
db.s.missingMetricIDs = nil
|
|
||||||
db.s.missingMetricIDsResetDeadline = ct + 2*60
|
|
||||||
}
|
|
||||||
deleteDeadline, ok := db.s.missingMetricIDs[metricID]
|
|
||||||
if !ok {
|
|
||||||
if db.s.missingMetricIDs == nil {
|
|
||||||
db.s.missingMetricIDs = make(map[uint64]uint64)
|
|
||||||
}
|
|
||||||
deleteDeadline = ct + 60
|
|
||||||
db.s.missingMetricIDs[metricID] = deleteDeadline
|
|
||||||
}
|
|
||||||
db.s.missingMetricIDsLock.Unlock()
|
|
||||||
|
|
||||||
if ct > deleteDeadline {
|
|
||||||
// Cannot find the MetricName for the given metricID for the last 60 seconds.
|
// Cannot find the MetricName for the given metricID for the last 60 seconds.
|
||||||
// It is likely the indexDB contains incomplete set of metricID -> metricName entries
|
// It is likely the indexDB contains incomplete set of metricID -> metricName entries
|
||||||
// after unclean shutdown or after restoring from a snapshot.
|
// after unclean shutdown or after restoring from a snapshot.
|
||||||
|
@ -1810,6 +1781,7 @@ func (db *indexDB) getTSIDsFromMetricIDs(qt *querytracer.Tracer, metricIDs []uin
|
||||||
tsidsFound := i
|
tsidsFound := i
|
||||||
qt.Printf("found %d tsids for %d metricIDs in the current indexdb", tsidsFound, len(metricIDs))
|
qt.Printf("found %d tsids for %d metricIDs in the current indexdb", tsidsFound, len(metricIDs))
|
||||||
|
|
||||||
|
var metricIDsToDelete []uint64
|
||||||
if len(extMetricIDs) > 0 {
|
if len(extMetricIDs) > 0 {
|
||||||
// Search for extMetricIDs in the previous indexdb (aka extDB)
|
// Search for extMetricIDs in the previous indexdb (aka extDB)
|
||||||
db.doExtDB(func(extDB *indexDB) {
|
db.doExtDB(func(extDB *indexDB) {
|
||||||
|
@ -1829,7 +1801,10 @@ func (db *indexDB) getTSIDsFromMetricIDs(qt *querytracer.Tracer, metricIDs []uin
|
||||||
// This may be the case on incomplete indexDB
|
// This may be the case on incomplete indexDB
|
||||||
// due to snapshot or due to unflushed entries.
|
// due to snapshot or due to unflushed entries.
|
||||||
// Just increment errors counter and skip it for now.
|
// Just increment errors counter and skip it for now.
|
||||||
is.db.missingTSIDsForMetricID.Add(1)
|
if is.db.s.shouldDeleteMissingMetricID(metricID) {
|
||||||
|
is.db.missingTSIDsForMetricID.Add(1)
|
||||||
|
metricIDsToDelete = append(metricIDsToDelete, metricID)
|
||||||
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
is.db.putToMetricIDCache(metricID, tsid)
|
is.db.putToMetricIDCache(metricID, tsid)
|
||||||
|
@ -1845,6 +1820,10 @@ func (db *indexDB) getTSIDsFromMetricIDs(qt *querytracer.Tracer, metricIDs []uin
|
||||||
tsids = tsids[:i]
|
tsids = tsids[:i]
|
||||||
qt.Printf("load %d tsids for %d metricIDs from both current and previous indexdb", len(tsids), len(metricIDs))
|
qt.Printf("load %d tsids for %d metricIDs from both current and previous indexdb", len(tsids), len(metricIDs))
|
||||||
|
|
||||||
|
if len(metricIDsToDelete) > 0 {
|
||||||
|
db.deleteMetricIDs(metricIDsToDelete)
|
||||||
|
}
|
||||||
|
|
||||||
// Sort the found tsids, since they must be passed to TSID search
|
// Sort the found tsids, since they must be passed to TSID search
|
||||||
// in the sorted order.
|
// in the sorted order.
|
||||||
sort.Slice(tsids, func(i, j int) bool { return tsids[i].Less(&tsids[j]) })
|
sort.Slice(tsids, func(i, j int) bool { return tsids[i].Less(&tsids[j]) })
|
||||||
|
@ -3313,8 +3292,10 @@ func mergeTagToMetricIDsRowsInternal(data []byte, items []mergeset.Item, nsPrefi
|
||||||
return dstData, dstItems
|
return dstData, dstItems
|
||||||
}
|
}
|
||||||
|
|
||||||
var indexBlocksWithMetricIDsIncorrectOrder atomic.Uint64
|
var (
|
||||||
var indexBlocksWithMetricIDsProcessed atomic.Uint64
|
indexBlocksWithMetricIDsIncorrectOrder atomic.Uint64
|
||||||
|
indexBlocksWithMetricIDsProcessed atomic.Uint64
|
||||||
|
)
|
||||||
|
|
||||||
func checkItemsSorted(data []byte, items []mergeset.Item) bool {
|
func checkItemsSorted(data []byte, items []mergeset.Item) bool {
|
||||||
if len(items) == 0 {
|
if len(items) == 0 {
|
||||||
|
|
|
@ -154,8 +154,9 @@ type Storage struct {
|
||||||
|
|
||||||
// missingMetricIDs maps metricID to the deadline in unix timestamp seconds
|
// missingMetricIDs maps metricID to the deadline in unix timestamp seconds
|
||||||
// after which all the indexdb entries for the given metricID
|
// after which all the indexdb entries for the given metricID
|
||||||
// must be deleted if metricName isn't found by the given metricID.
|
// must be deleted if index entry isn't found by the given metricID.
|
||||||
// This is used inside searchMetricNameWithCache() for detecting permanently missing metricID->metricName entries.
|
// This is used inside searchMetricNameWithCache() and getTSIDsFromMetricIDs()
|
||||||
|
// for detecting permanently missing metricID->metricName/TSID entries.
|
||||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959
|
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959
|
||||||
missingMetricIDsLock sync.Mutex
|
missingMetricIDsLock sync.Mutex
|
||||||
missingMetricIDs map[uint64]uint64
|
missingMetricIDs map[uint64]uint64
|
||||||
|
@ -2709,3 +2710,43 @@ var indexDBTableIdx = func() *atomic.Uint64 {
|
||||||
x.Store(uint64(time.Now().UnixNano()))
|
x.Store(uint64(time.Now().UnixNano()))
|
||||||
return &x
|
return &x
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
// shouldDeleteMissingMetricID reports whether the index entry for the given metricID has stayed missing past its deletion deadline.
|
||||||
|
//
|
||||||
|
// It only tracks deadlines and returns the verdict; the broken index entry must be deleted by the caller when true is returned.
|
||||||
|
// There are the following expected cases when an index entry may be missing:
|
||||||
|
//
|
||||||
|
// 1. The corresponding metricID -> metricName/tsid entry isn't visible for search yet.
|
||||||
|
// The solution is to wait for some time and try the search again.
|
||||||
|
// It is OK if newly registered time series isn't visible for search during some time.
|
||||||
|
// This should resolve https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959
|
||||||
|
//
|
||||||
|
// 2. The metricID -> metricName/tsid entry doesn't exist in the indexdb.
|
||||||
|
// This is possible after unclean shutdown or after restoring of indexdb from a snapshot.
|
||||||
|
// In this case the metricID must be deleted, so new metricID is registered
|
||||||
|
// again when new sample for the given metric is ingested next time.
|
||||||
|
func (s *Storage) shouldDeleteMissingMetricID(metricID uint64) bool {
|
||||||
|
ct := fasttime.UnixTimestamp()
|
||||||
|
s.missingMetricIDsLock.Lock()
|
||||||
|
defer s.missingMetricIDsLock.Unlock()
|
||||||
|
|
||||||
|
if ct > s.missingMetricIDsResetDeadline {
|
||||||
|
s.missingMetricIDs = nil // drop all the tracked deadlines; tracking restarts from scratch
|
||||||
|
s.missingMetricIDsResetDeadline = ct + 2*60
|
||||||
|
}
|
||||||
|
deleteDeadline, ok := s.missingMetricIDs[metricID]
|
||||||
|
if !ok {
|
||||||
|
if s.missingMetricIDs == nil {
|
||||||
|
s.missingMetricIDs = make(map[uint64]uint64)
|
||||||
|
}
|
||||||
|
deleteDeadline = ct + 60 // metricID isn't tracked yet: give the entry 60 units of ct to appear
|
||||||
|
s.missingMetricIDs[metricID] = deleteDeadline
|
||||||
|
}
|
||||||
|
// Return true only if the deadline recorded for the given metricID has already passed.
|
||||||
|
// It is likely the indexDB contains incomplete set of metricID -> metricName/tsid entries
|
||||||
|
// after unclean shutdown or after restoring from a snapshot.
|
||||||
|
// The caller is expected to mark the metricID as deleted, so it is created again when new sample
|
||||||
|
// for the given time series is ingested next time.
|
||||||
|
|
||||||
|
|
||||||
|
return ct > deleteDeadline
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue