From 76f00cea6b9bf02a262275a112040c2680ed472c Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Mon, 18 Mar 2024 00:19:17 +0200 Subject: [PATCH] lib/storage: wait for up to 60 seconds before deciding to delete metricID entries from indexdb if metricID->metricName entry is missing during search The metricID->metricName entry can remain invisible for search for some time after registering new metricName. This is expected condition. So wait for up to 60 seconds in the hope that the metricID->metricName entry will become visible before deleting all the entries from indexdb, which are associated with the given metricID. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5948 See also 20812008a740e68f0e646e9b774e2d55030b736d --- docs/CHANGELOG.md | 1 + lib/storage/index_db.go | 45 ++++++++++++++++++++++++++++++++++------- lib/storage/storage.go | 7 +++++++ 3 files changed, 46 insertions(+), 7 deletions(-) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index eb16942386..46f699bdd7 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -55,6 +55,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/). * FEATURE: [vmctl](https://docs.victoriametrics.com/vmctl.html): support client-side TLS configuration for [native protocol](https://docs.victoriametrics.com/vmctl/#migrating-data-from-victoriametrics). See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5748). Thanks to @khushijain21 for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5824). * FEATURE: [vmctl](https://docs.victoriametrics.com/vmctl.html): support client-side TLS configuration for VictoriaMetrics destination specified via `--vm-*` cmd-line flags used in [InfluxDB](https://docs.victoriametrics.com/vmctl/#migrating-data-from-influxdb-1x), [Remote Read protocol](https://docs.victoriametrics.com/vmctl/#migrating-data-by-remote-read-protocol), [OpenTSDB](https://docs.victoriametrics.com/vmctl/#migrating-data-from-opentsdb), [Prometheus](https://docs.victoriametrics.com/vmctl/#migrating-data-from-prometheus) and [Promscale](https://docs.victoriametrics.com/vmctl/#migrating-data-from-promscale) migration modes. +* BUGFIX: prevent from automatic deletion of newly registered time series when it is queried immediately after the addition. The probability of this bug has been increased significantly after [v1.99.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.99.0) because of optimizations related to registering new time series. See [this](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5948) and [this](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959) issue. * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly set `Host` header in requests to scrape targets if it is specified via [`headers` option](https://docs.victoriametrics.com/sd_configs/#http-api-client-options). Thanks to @fholzer for [the bugreport](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5969) and [the fix](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5970). * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly set `Host` header in requests to scrape targets when [`server_name` option at `tls_config`](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#tls_config) is set. Previously the `Host` header was set incorrectly to the target hostname in this case. * BUGFIX: do not drop `match[]` filter at [`/api/v1/series`](https://docs.victoriametrics.com/url-examples/#apiv1series) if `-search.ignoreExtraFiltersAtLabelsAPI` command-line flag is set, since missing `match[]` filter breaks `/api/v1/series` requests. diff --git a/lib/storage/index_db.go b/lib/storage/index_db.go index da5f956aca..26728b0395 100644 --- a/lib/storage/index_db.go +++ b/lib/storage/index_db.go @@ -1485,14 +1485,45 @@ func (db *indexDB) searchMetricNameWithCache(dst []byte, metricID uint64) ([]byt return dst, true } - // Cannot find MetricName for the given metricID. This may be the case - // when indexDB contains incomplete set of metricID -> metricName entries - // after a snapshot or due to unflushed entries. - db.missingMetricNamesForMetricID.Add(1) + // Cannot find the MetricName for the given metricID. + // There are the following expected cases when this may happen: + // + // 1. The corresponding metricID -> metricName entry isn't visible for search yet. + // The solution is to wait for some time and try the search again. + // It is OK if newly registered time series isn't visible for search during some time. + // This should resolve https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959 + // + // 2. The metricID -> metricName entry doesn't exist in the indexdb. + // This is possible after unclean shutdown or after restoring of indexdb from a snapshot. + // In this case the metricID must be deleted, so new metricID is registered + // again when new sample for the given metricName is ingested next time. + // + ct := fasttime.UnixTimestamp() + db.s.missingMetricIDsLock.Lock() + if ct > db.s.missingMetricIDsResetDeadline { + db.s.missingMetricIDs = nil + db.s.missingMetricIDsResetDeadline = ct + 2*60 + } + timestamp, ok := db.s.missingMetricIDs[metricID] + if !ok { + if db.s.missingMetricIDs == nil { + db.s.missingMetricIDs = make(map[uint64]uint64) + } + db.s.missingMetricIDs[metricID] = ct + timestamp = ct + } + db.s.missingMetricIDsLock.Unlock() + + if ct > timestamp+60 { + // Cannot find the MetricName for the given metricID for the last 60 seconds. + // It is likely the indexDB contains incomplete set of metricID -> metricName entries + // after unclean shutdown or after restoring from a snapshot. + // Mark the metricID as deleted, so it is created again when new sample + // for the given time series is ingested next time. + db.missingMetricNamesForMetricID.Add(1) + db.deleteMetricIDs([]uint64{metricID}) + } - // Mark the metricID as deleted, so it will be created again when new data point - // for the given time series will arrive. - db.deleteMetricIDs([]uint64{metricID}) return dst, false } diff --git a/lib/storage/storage.go b/lib/storage/storage.go index dab0e72082..8be641051e 100644 --- a/lib/storage/storage.go +++ b/lib/storage/storage.go @@ -147,6 +147,13 @@ type Storage struct { deletedMetricIDs atomic.Pointer[uint64set.Set] deletedMetricIDsUpdateLock sync.Mutex + // missingMetricIDs maps metricID to the timestamp of first unsuccessful lookup + // of metricName by the given metricID. + // This is used inside searchMetricNameWithCache() for detecting permanently missing metricID->metricName entries. + missingMetricIDsLock sync.Mutex + missingMetricIDs map[uint64]uint64 + missingMetricIDsResetDeadline uint64 + // isReadOnly is set to true when the storage is in read-only mode. isReadOnly atomic.Bool }