From ab4ab5b921e3b4589b970d2fa77f6a9d871ae2e7 Mon Sep 17 00:00:00 2001
From: hagen1778 <roman@victoriametrics.com>
Date: Tue, 17 Oct 2023 15:45:14 +0200
Subject: [PATCH] lib/storage: follow-up after
 188cfe3a8500761d4fa27f1346c74a08263222e7

https://github.com/VictoriaMetrics/VictoriaMetrics/commit/188cfe3a8500761d4fa27f1346c74a08263222e7

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5159

Signed-off-by: hagen1778 <roman@victoriametrics.com>
---
 docs/CHANGELOG.md           |  1 +
 lib/storage/storage.go      |  7 +--
 lib/storage/storage_test.go | 89 +++++++++++++++++++------------------
 3 files changed, 51 insertions(+), 46 deletions(-)

diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 311d81229b..9f3e8ae96d 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -14,6 +14,7 @@ The following `tip` changes can be tested by building VictoriaMetrics components
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly parse `ca`, `cert` and `key` options at `tls_config` section inside [http client settings](https://docs.victoriametrics.com/sd_configs.html#http-api-client-options). Previously string values couldn't be parsed for these options, since the parser was mistakenly expecting a list of `uint8` values instead.
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly drop samples if `-streamAggr.dropInput` command-line flag is set and `-remoteWrite.streamAggr.config` contains an empty file. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5207).
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): do not print redundant error logs when failed to scrape consul or nomad target. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5239).
+* BUGFIX: [vmstorage](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): prevent deleted series to be searchable via `/api/v1/series` API if they were re-ingested with staleness markers. This situation could happen if user deletes the series from the target and from VM, and then vmagent sends stale markers for absent series. Thanks to @ilyatrefilov for the [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5069) and [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5174).
 
 ## [v1.93.6](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.93.6)
 
diff --git a/lib/storage/storage.go b/lib/storage/storage.go
index 57587e6247..f7b39e87c9 100644
--- a/lib/storage/storage.go
+++ b/lib/storage/storage.go
@@ -1835,10 +1835,11 @@ func (s *Storage) add(rows []rawRow, dstMrs []*MetricRow, mrs []MetricRow, preci
 
 	// Return only the first error, since it has no sense in returning all errors.
 	var firstWarn error
-	var isStaleNan bool
+
 	j := 0
 	for i := range mrs {
 		mr := &mrs[i]
+		var isStaleNan bool
 		if math.IsNaN(mr.Value) {
 			if !decimal.IsStaleNaN(mr.Value) {
 				// Skip NaNs other than Prometheus staleness marker, since the underlying encoding
@@ -1959,8 +1960,8 @@ func (s *Storage) add(rows []rawRow, dstMrs []*MetricRow, mrs []MetricRow, preci
 			continue
 		}
 
-		// If TSID was not found in cache and in indexdb, it's deleted. If metric contains stale
-		// marker, do not create a new TSID.
+		// If sample is stale and its TSID wasn't found in cache and in indexdb,
+		// then we skip it. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5069
 		if isStaleNan {
 			j--
 			continue
diff --git a/lib/storage/storage_test.go b/lib/storage/storage_test.go
index 0835658575..b79b131008 100644
--- a/lib/storage/storage_test.go
+++ b/lib/storage/storage_test.go
@@ -1362,61 +1362,64 @@ func TestStorageDeleteStaleSnapshots(t *testing.T) {
 }
 
 func TestStorageSeriesAreNotCreatedOnStaleMarkers(t *testing.T) {
-	rng := rand.New(rand.NewSource(1))
 	path := "TestStorageSeriesAreNotCreatedOnStaleMarkers"
 	s := MustOpenStorage(path, -1, 1e5, 1e6)
 
-	// Verify no label names exist
-	lns, err := s.SearchLabelNamesWithFiltersOnTimeRange(nil, nil, TimeRange{}, 1e5, 1e9, noDeadline)
-	if err != nil {
-		t.Fatalf("error in SearchLabelNamesWithFiltersOnTimeRange() at the start: %s", err)
-	}
-	if len(lns) != 0 {
-		t.Fatalf("found non-empty tag keys at the start: %q", lns)
+	tr := TimeRange{MinTimestamp: 0, MaxTimestamp: 2e10}
+	tfsAll := NewTagFilters()
+	if err := tfsAll.Add([]byte("__name__"), []byte(".*"), false, true); err != nil {
+		t.Fatalf("unexpected error in TagFilters.Add: %s", err)
 	}
 
-	// Add rows with stale markers to the storage
-	mrs := testGenerateStaleMetricRows(rng, 100, 0, 2e10)
+	findN := func(n int) {
+		t.Helper()
+		lns, err := s.SearchMetricNames(nil, []*TagFilters{tfsAll}, tr, 1e5, noDeadline)
+		if err != nil {
+			t.Fatalf("error in SearchLabelNamesWithFiltersOnTimeRange() at the start: %s", err)
+		}
+		if len(lns) != n {
+			fmt.Println(lns)
+			t.Fatalf("expected to find %d metric names, found %d instead", n, len(lns))
+		}
+	}
+
+	// db is empty, so should be search results
+	findN(0)
+
+	rng := rand.New(rand.NewSource(1))
+	mrs := testGenerateMetricRows(rng, 20, tr.MinTimestamp, tr.MaxTimestamp)
+	// populate storage with some rows
+	if err := s.AddRows(mrs[:10], defaultPrecisionBits); err != nil {
+		t.Fatal("error when adding mrs: %w", err)
+	}
+	s.DebugFlush()
+
+	// verify ingested rows are searchable
+	findN(10)
+
+	// clean up ingested data
+	_, err := s.DeleteSeries(nil, []*TagFilters{tfsAll})
+	if err != nil {
+		t.Fatalf("DeleteSeries failed: %s", err)
+	}
+
+	// verify that data was actually deleted
+	findN(0)
+
+	// mark every 2nd row as stale, simulating a stale target
+	for i := 0; i < len(mrs); i = i + 2 {
+		mrs[i].Value = decimal.StaleNaN
+	}
 	if err := s.AddRows(mrs, defaultPrecisionBits); err != nil {
 		t.Fatal("error when adding mrs: %w", err)
 	}
+	s.DebugFlush()
 
-	// Verify that tag keys were not created
-	lns, err = s.SearchLabelNamesWithFiltersOnTimeRange(nil, nil, TimeRange{}, 1e5, 1e9, noDeadline)
-	if err != nil {
-		t.Fatalf("error in SearchLabelNamesWithFiltersOnTimeRange() at the start: %s", err)
-	}
-	if len(lns) != 0 {
-		t.Fatalf("found non-empty tag keys at the start: %q", lns)
-	}
+	// verify that rows marked as stale aren't searchable
+	findN(10)
 
 	s.MustClose()
 	if err := os.RemoveAll(path); err != nil {
 		t.Fatalf("cannot remove %q: %s", path, err)
 	}
 }
-
-func testGenerateStaleMetricRows(rng *rand.Rand, rows uint64, timestampMin, timestampMax int64) []MetricRow {
-	var mrs []MetricRow
-	var mn MetricName
-	mn.Tags = []Tag{
-		{[]byte("job"), []byte("test_service")},
-		{[]byte("instance"), []byte("1.2.3.4")},
-	}
-
-	for i := 0; i < int(rows); i++ {
-		mn.MetricGroup = []byte(fmt.Sprintf("metric_%d", i))
-		metricNameRaw := mn.marshalRaw(nil)
-		timestamp := rng.Int63n(timestampMax - timestampMin)
-		value := decimal.StaleNaN
-
-		mr := MetricRow{
-			MetricNameRaw: metricNameRaw,
-			Timestamp:     timestamp,
-			Value:         value,
-		}
-		mrs = append(mrs, mr)
-	}
-
-	return mrs
-}