lib/storage: remove metricID with missing metricID->metricName entry
The metricID->metricName entry can be missing from indexdb after an unclean shutdown, when only a part of the entries for a new time series has been written to indexdb. Recover from such a situation by removing the broken metricID. A new metricID is created automatically for the time series with the given metricName when the next data point for it arrives.
Parent: 62a915f2b2
Commit: 20812008a7

4 changed files with 45 additions and 29 deletions
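In miniature, the fix works like this (a self-contained toy sketch in Go, not VictoriaMetrics code; the type and field names below are illustrative): a metricName lookup that hits a dangling metricID bumps a counter, marks the metricID as deleted, and returns io.EOF so the caller skips the series; the series is re-registered under a fresh metricID when its next sample arrives.

package main

import (
	"errors"
	"fmt"
	"io"
)

// index is a toy model of the metricID -> metricName mapping.
type index struct {
	metricNames map[uint64]string
	deleted     map[uint64]bool
	missing     uint64 // counts missing metricID -> metricName entries
}

func (idx *index) searchMetricName(metricID uint64) (string, error) {
	if name, ok := idx.metricNames[metricID]; ok {
		return name, nil
	}
	// Entry is missing (e.g. after an unclean shutdown): count it and mark
	// the metricID as deleted, so a fresh metricID is created when a new
	// data point for the series arrives.
	idx.missing++
	idx.deleted[metricID] = true
	return "", io.EOF
}

func main() {
	idx := &index{
		metricNames: map[uint64]string{1: `http_requests_total{job="a"}`},
		deleted:     map[uint64]bool{},
	}
	for _, id := range []uint64{1, 2} {
		name, err := idx.searchMetricName(id)
		if errors.Is(err, io.EOF) {
			continue // skip the broken metricID; it has been auto-deleted
		}
		fmt.Println(id, name)
	}
	fmt.Println("missing:", idx.missing, "deleted IDs:", idx.deleted)
}
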
@@ -462,16 +462,12 @@ func getStorageSearch() *storage.Search {
 }
 
 func putStorageSearch(sr *storage.Search) {
-	n := atomic.LoadUint64(&sr.MissingMetricNamesForMetricID)
-	missingMetricNamesForMetricID.Add(int(n))
 	sr.MustClose()
 	ssPool.Put(sr)
 }
 
 var ssPool sync.Pool
 
-var missingMetricNamesForMetricID = metrics.NewCounter(`vm_missing_metric_names_for_metric_id_total`)
-
 // ProcessSearchQuery performs sq on storage nodes until the given deadline.
 func ProcessSearchQuery(sq *storage.SearchQuery, fetchData bool, deadline Deadline) (*Results, error) {
	// Setup search.

@@ -414,6 +414,10 @@ func registerStorageMetrics() {
 		return float64(idbm().DateRangeSearchHits)
 	})
 
+	metrics.NewGauge(`vm_missing_metric_names_for_metric_id_total`, func() float64 {
+		return float64(idbm().MissingMetricNamesForMetricID)
+	})
+
 	metrics.NewGauge(`vm_date_metric_id_cache_syncs_total`, func() float64 {
 		return float64(m().DateMetricIDCacheSyncsCount)
 	})

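A note on the registration pattern in the hunk above: the stat is now exported as a callback gauge that samples indexDB's internal counter on each scrape, which is why the separate metrics.NewCounter and the per-search hand-off in the first hunk could be removed. A minimal sketch using the github.com/VictoriaMetrics/metrics package (the internal counter here is a stand-in for idbm().MissingMetricNamesForMetricID):

package main

import (
	"bytes"
	"fmt"
	"sync/atomic"

	"github.com/VictoriaMetrics/metrics"
)

// Stand-in for the internal counter behind idbm().MissingMetricNamesForMetricID.
var missing uint64

func main() {
	// Callback gauge: the exporter reads the current value on every scrape,
	// so no separate metrics.Counter must be kept in sync with it.
	metrics.NewGauge(`vm_missing_metric_names_for_metric_id_total`, func() float64 {
		return float64(atomic.LoadUint64(&missing))
	})

	atomic.AddUint64(&missing, 3)

	var bb bytes.Buffer
	metrics.WritePrometheus(&bb, false)
	fmt.Print(bb.String()) // includes vm_missing_metric_names_for_metric_id_total 3
}
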
@@ -101,6 +101,11 @@ type indexDB struct {
 	// The number of hits for date range searches.
 	dateRangeSearchHits uint64
 
+	// missingMetricNamesForMetricID is a counter of missing MetricID -> MetricName entries.
+	// High rate may mean corrupted indexDB due to unclean shutdown.
+	// The db must be automatically recovered after that.
+	missingMetricNamesForMetricID uint64
+
 	mustDrop uint64
 
 	// Start date fully covered by per-day inverted index.

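Like the neighboring dateRangeSearchHits, the new field is a plain uint64 updated through sync/atomic rather than guarded by a mutex. A minimal sketch of that pattern (illustrative names; note that atomically accessed 64-bit fields must be 64-bit aligned on 32-bit platforms, one reason to group such counters together in a struct):

package main

import (
	"fmt"
	"sync/atomic"
)

// indexStats mimics the counter style of indexDB: bare uint64 fields
// that are only ever touched via sync/atomic.
type indexStats struct {
	missingMetricNamesForMetricID uint64
}

func main() {
	var s indexStats
	atomic.AddUint64(&s.missingMetricNamesForMetricID, 1) // as in searchMetricName
	fmt.Println(atomic.LoadUint64(&s.missingMetricNamesForMetricID))
}
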
@@ -228,6 +233,8 @@ type IndexDBMetrics struct {
 	DateRangeSearchCalls uint64
 	DateRangeSearchHits uint64
 
+	MissingMetricNamesForMetricID uint64
+
 	IndexBlocksWithMetricIDsProcessed uint64
 	IndexBlocksWithMetricIDsIncorrectOrder uint64
 
@@ -269,6 +276,8 @@ func (db *indexDB) UpdateMetrics(m *IndexDBMetrics) {
 	m.DateRangeSearchCalls += atomic.LoadUint64(&db.dateRangeSearchCalls)
 	m.DateRangeSearchHits += atomic.LoadUint64(&db.dateRangeSearchHits)
 
+	m.MissingMetricNamesForMetricID += atomic.LoadUint64(&db.missingMetricNamesForMetricID)
+
 	m.IndexBlocksWithMetricIDsProcessed = atomic.LoadUint64(&indexBlocksWithMetricIDsProcessed)
 	m.IndexBlocksWithMetricIDsIncorrectOrder = atomic.LoadUint64(&indexBlocksWithMetricIDsIncorrectOrder)
 
@@ -888,6 +897,13 @@ func (db *indexDB) searchMetricName(dst []byte, metricID uint64) ([]byte, error)
 		// Cannot find MetricName for the given metricID. This may be the case
 		// when indexDB contains incomplete set of metricID -> metricName entries
 		// after a snapshot or due to unflushed entries.
+		atomic.AddUint64(&db.missingMetricNamesForMetricID, 1)
+
+		// Mark the metricID as deleted, so it will be created again when new data point
+		// for the given time series will arrive.
+		if err := db.deleteMetricIDs([]uint64{metricID}); err != nil {
+			return dst, fmt.Errorf("cannot delete metricID for missing metricID->metricName entry; metricID=%d; error: %s", metricID, err)
+		}
 		return dst, io.EOF
 	}
 
@@ -912,9 +928,28 @@ func (db *indexDB) DeleteTSIDs(tfss []*TagFilters) (int, error) {
 	if err != nil {
 		return 0, err
 	}
+	if err := db.deleteMetricIDs(metricIDs); err != nil {
+		return 0, err
+	}
+
+	// Delete TSIDs in the extDB.
+	deletedCount := len(metricIDs)
+	if db.doExtDB(func(extDB *indexDB) {
+		var n int
+		n, err = extDB.DeleteTSIDs(tfss)
+		deletedCount += n
+	}) {
+		if err != nil {
+			return deletedCount, fmt.Errorf("cannot delete tsids in extDB: %s", err)
+		}
+	}
+	return deletedCount, nil
+}
+
+func (db *indexDB) deleteMetricIDs(metricIDs []uint64) error {
 	if len(metricIDs) == 0 {
 		// Nothing to delete
-		return 0, nil
+		return nil
 	}
 
 	// Mark the found metricIDs as deleted.

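The extDB block that moved into this hunk uses a pattern worth spelling out: doExtDB invokes the callback against the previous-generation indexDB only when one exists (returning whether it ran), and the callback smuggles its error out through the captured err variable, since the callback itself returns nothing. A self-contained toy model of that shape (illustrative names, not VictoriaMetrics code):

package main

import "fmt"

// db is a toy two-generation index: the current generation plus an
// optional previous generation (ext), mirroring indexDB/extDB.
type db struct {
	ids []uint64 // pretend these are matching metricIDs
	ext *db      // previous generation; may be nil
}

// doExtDB runs fn on the previous generation if it exists and reports
// whether fn was invoked, like indexDB.doExtDB in the diff above.
func (d *db) doExtDB(fn func(ext *db)) bool {
	if d.ext == nil {
		return false
	}
	fn(d.ext)
	return true
}

func (d *db) deleteTSIDs() (int, error) {
	deleted := len(d.ids) // pretend every local ID matched
	d.ids = nil

	var err error // captured by the closure below
	if d.doExtDB(func(ext *db) {
		var n int
		n, err = ext.deleteTSIDs()
		deleted += n
	}) {
		if err != nil {
			return deleted, fmt.Errorf("cannot delete tsids in extDB: %s", err)
		}
	}
	return deleted, nil
}

func main() {
	cur := &db{ids: []uint64{10, 11}, ext: &db{ids: []uint64{12}}}
	n, err := cur.deleteTSIDs()
	fmt.Println(n, err) // 3 <nil>
}
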
@@ -924,12 +959,11 @@ func (db *indexDB) DeleteTSIDs(tfss []*TagFilters) (int, error) {
 		items.B = encoding.MarshalUint64(items.B, metricID)
 		items.Next()
 	}
-	err = db.tb.AddItems(items.Items)
+	err := db.tb.AddItems(items.Items)
 	putIndexItems(items)
 	if err != nil {
-		return 0, err
+		return err
 	}
-	deletedCount := len(metricIDs)
 
 	// atomically add deleted metricIDs to an inmemory map.
 	dmis := &uint64set.Set{}

@@ -943,18 +977,7 @@ func (db *indexDB) DeleteTSIDs(tfss []*TagFilters) (int, error) {
 
 	// Do not reset uselessTagFiltersCache, since the found metricIDs
 	// on cache miss are filtered out later with deletedMetricIDs.
-
-	// Delete TSIDs in the extDB.
-	if db.doExtDB(func(extDB *indexDB) {
-		var n int
-		n, err = extDB.DeleteTSIDs(tfss)
-		deletedCount += n
-	}) {
-		if err != nil {
-			return deletedCount, fmt.Errorf("cannot delete tsids in extDB: %s", err)
-		}
-	}
-	return deletedCount, nil
+	return nil
 }
 
 func (db *indexDB) getDeletedMetricIDs() *uint64set.Set {

@@ -3,7 +3,6 @@ package storage
 import (
 	"fmt"
 	"io"
-	"sync/atomic"
 
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"

@@ -89,11 +88,6 @@ type Search struct {
 	err error
 
 	needClosing bool
-
-	// MissingMetricNamesForMetricID is a counter of missing MetricID -> MetricName
-	// entries during the search.
-	// High rate may mean corrupted indexDB.
-	MissingMetricNamesForMetricID uint64
 }
 
 func (s *Search) reset() {

@@ -104,7 +98,6 @@ func (s *Search) reset() {
 	s.ts.reset()
 	s.err = nil
 	s.needClosing = false
-	s.MissingMetricNamesForMetricID = 0
 }
 
 // Init initializes s from the given storage, tfss and tr.

@@ -161,8 +154,8 @@ func (s *Search) NextMetricBlock() bool {
 		s.MetricBlock.MetricName, err = s.storage.searchMetricName(s.MetricBlock.MetricName[:0], tsid.MetricID)
 		if err != nil {
 			if err == io.EOF {
-				// Missing metricName for tsid.MetricID. Increment error counter and skip it.
-				atomic.AddUint64(&s.MissingMetricNamesForMetricID, 1)
+				// Skip missing metricName for tsid.MetricID.
+				// It should be automatically fixed. See indexDB.searchMetricName for details.
 				continue
 			}
 			s.err = err