From 69d244e6fba675a50b8baf29aaf4b1df2d77a454 Mon Sep 17 00:00:00 2001
From: Nikolay <nik@victoriametrics.com>
Date: Fri, 24 May 2024 14:55:20 +0200
Subject: [PATCH 1/4] lib/mergeset: adds tracking for indexdb records drop
 (#6297)

This allows creating an alert for possible item drops at indexdb. Drops may
happen if the ingested metric size exceeds the max indexdb item size.

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
Co-authored-by: hagen1778 <roman@victoriametrics.com>
---
 app/vmstorage/main.go                |  4 +++-
 deployment/docker/alerts-cluster.yml |  3 +--
 deployment/docker/alerts-health.yml  | 11 +++++++++++
 deployment/docker/alerts.yml         |  2 +-
 docs/CHANGELOG.md                    |  3 ++-
 lib/mergeset/table.go                |  9 ++++++++-
 6 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/app/vmstorage/main.go b/app/vmstorage/main.go
index a902dc335..c40ab47bd 100644
--- a/app/vmstorage/main.go
+++ b/app/vmstorage/main.go
@@ -195,7 +195,8 @@ func SearchLabelNamesWithFiltersOnTimeRange(qt *querytracer.Tracer, tfss []*stor
 
 // SearchLabelValuesWithFiltersOnTimeRange searches for label values for the given labelName, tfss and tr.
 func SearchLabelValuesWithFiltersOnTimeRange(qt *querytracer.Tracer, labelName string, tfss []*storage.TagFilters,
-	tr storage.TimeRange, maxLabelValues, maxMetrics int, deadline uint64) ([]string, error) {
+	tr storage.TimeRange, maxLabelValues, maxMetrics int, deadline uint64,
+) ([]string, error) {
 	WG.Add(1)
 	labelValues, err := Storage.SearchLabelValuesWithFiltersOnTimeRange(qt, labelName, tfss, tr, maxLabelValues, maxMetrics, deadline)
 	WG.Done()
@@ -492,6 +493,7 @@ func writeStorageMetrics(w io.Writer, strg *storage.Storage) {
 
 	metrics.WriteCounterUint64(w, `vm_indexdb_items_added_total`, idbm.ItemsAdded)
 	metrics.WriteCounterUint64(w, `vm_indexdb_items_added_size_bytes_total`, idbm.ItemsAddedSizeBytes)
+	metrics.WriteCounterUint64(w, `vm_indexdb_items_dropped_total{reason="too_long_item"}`, idbm.TooLongItemsDroppedTotal)
 
 	metrics.WriteGaugeUint64(w, `vm_pending_rows{type="storage"}`, tm.PendingRows)
 	metrics.WriteGaugeUint64(w, `vm_pending_rows{type="indexdb"}`, idbm.PendingItems)
diff --git a/deployment/docker/alerts-cluster.yml b/deployment/docker/alerts-cluster.yml
index aab0850dc..c4d092990 100644
--- a/deployment/docker/alerts-cluster.yml
+++ b/deployment/docker/alerts-cluster.yml
@@ -175,5 +175,4 @@ groups:
           description: "The connection between vminsert (instance {{ $labels.instance }}) and vmstorage (instance {{ $labels.addr }})
             is saturated by more than 90% and vminsert won't be able to keep up.\n
             This usually means that more vminsert or vmstorage nodes must be added to the cluster in order to increase
-            the total number of vminsert -> vmstorage links."
-
+            the total number of vminsert -> vmstorage links."
\ No newline at end of file
diff --git a/deployment/docker/alerts-health.yml b/deployment/docker/alerts-health.yml
index 808850339..a3f2ea4a8 100644
--- a/deployment/docker/alerts-health.yml
+++ b/deployment/docker/alerts-health.yml
@@ -87,3 +87,14 @@ groups:
             In some cases for components like vmagent or vminsert the alert might trigger if there are too many clients
             making write attempts. If vmagent's or vminsert's CPU usage and network saturation are at normal level, then 
             it might be worth adjusting `-maxConcurrentInserts` cmd-line flag."
+
+      - alert: IndexDBRecordsDrop
+        expr: increase(vm_indexdb_items_dropped_total[5m]) > 0
+        labels:
+          severity: critical
+        annotations:
+          summary: "IndexDB skipped registering items during data ingestion with reason={{ $labels.reason }}."
+          description: "VictoriaMetrics could skip registering new timeseries during ingestion if they fail the validation process. 
+          For example, `reason=too_long_item` means that time series cannot exceed 64KB. Please, reduce the number 
+          of labels or label values for such series. Or enforce these limits via `-maxLabelsPerTimeseries` and 
+          `-maxLabelValueLen` command-line flags."
\ No newline at end of file
diff --git a/deployment/docker/alerts.yml b/deployment/docker/alerts.yml
index f20122eae..7c7832c6f 100644
--- a/deployment/docker/alerts.yml
+++ b/deployment/docker/alerts.yml
@@ -130,4 +130,4 @@ groups:
           summary: "Metrics ingested in ({{ $labels.instance }}) are exceeding labels limit"
           description: "VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.\n
            This prevents ingestion of metrics with too many labels. Please verify that `-maxLabelsPerTimeseries` is configured
-           correctly or that clients which send these metrics aren't misbehaving."
+           correctly or that clients which send these metrics aren't misbehaving."
\ No newline at end of file
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 40c29a6a9..e109bb5a2 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -43,6 +43,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
 * FEATURE: [dashboards/single](https://grafana.com/grafana/dashboards/10229): add `Network Usage` panel to `Resource Usage` row.
 * FEATURE: [dashboards/operator](https://grafana.com/grafana/dashboards/17869), [dashboards/backupmanager](https://grafana.com/grafana/dashboards/17798) and [dashboard/tenant-statistic](https://grafana.com/grafana/dashboards/16399): update dashboard to be compatible with Grafana 10+ version.
 * FEATURE: [dashboards/cluster](https://grafana.com/grafana/dashboards/11176): add new panel `Concurrent selects` to `vmstorage` row. The panel will show how many ongoing select queries are processed by vmstorage and should help to identify resource bottlenecks. See panel description for more details.
+* FEATURE: [dashboards](https://grafana.com/orgs/victoriametrics): use `$__interval` variable for offsets and look-behind windows in annotations. This should improve precision of `restarts` and `version change` annotations when zooming-in/zooming-out on the dashboards.
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): support aggregation and deduplication configs before replicating data to configured `-remoteWrite.url` destinations. This saves CPU and memory resources when incoming data needs to be aggregated or deduplicated once and then replicated to multiple destinations. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5467).
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add service discovery support for [Vultr](https://www.vultr.com/). See [these docs](https://docs.victoriametrics.com/sd_configs/#vultr_sd_configs) and [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6041).
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): allow configuring `-remoteWrite.disableOnDiskQueue` and `-remoteWrite.dropSamplesOnOverload` cmd-line flags per each `-remoteWrite.url`. See this [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6065). Thanks to @rbizos for implementaion!
@@ -52,7 +53,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
 * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert/): speed up retrieving rules files from object storages by skipping unchanged objects during reloading. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6210).
 * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert/): support reading [DNS SRV](https://en.wikipedia.org/wiki/SRV_record) records in `-datasource.url`, `-remoteWrite.url` and `-remoteRead.url` command-line option. For example, `-remoteWrite.url=http://srv+victoria-metrics` automatically resolves the `victoria-metrics` DNS SRV to a list of hostnames with TCP ports and then sends data to one of the addresses. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6053).
 * FEATURE: [vmbackup](https://docs.victoriametrics.com/vmbackup/), [vmrestore](https://docs.victoriametrics.com/vmrestore/), [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager/): add `-s3TLSInsecureSkipVerify` command-line flag for skipping TLS certificates verification when connecting to S3 endpoint.
-* FEATURE: [dashboards](https://grafana.com/orgs/victoriametrics): use `$__interval` variable for offsets and look-behind windows in annotations. This should improve precision of `restarts` and `version change` annotations when zooming-in/zooming-out on the dashboards.
+* FEATURE: expose metric `vm_indexdb_items_dropped_total` to track the number of IndexDB records that had to be dropped during ingestion. The reason of dropping the record will be annotated in `reason` label of the exposed metric. This change also comes with a new [alerting rule](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-health.yml) to track changes of this metric.
 
 * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix bug that prevents the first query trace from expanding on click event. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6186). The issue was introduced in [v1.100.0](https://docs.victoriametrics.com/changelog/#v11000) release.
 * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix calendar display when `UTC+00:00` timezone is set. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6239).
diff --git a/lib/mergeset/table.go b/lib/mergeset/table.go
index 7e78f3656..69911bddf 100644
--- a/lib/mergeset/table.go
+++ b/lib/mergeset/table.go
@@ -276,7 +276,8 @@ func (ris *rawItemsShard) addItems(items [][]byte) ([][]byte, []*inmemoryBlock)
 		if len(itemPrefix) > 128 {
 			itemPrefix = itemPrefix[:128]
 		}
-		tooLongItemLogger.Errorf("skipping adding too long item to indexdb: len(item)=%d; it souldn't exceed %d bytes; item prefix=%q", len(item), maxInmemoryBlockSize, itemPrefix)
+		tooLongItemsTotal.Add(1)
+		tooLongItemLogger.Errorf("skipping adding too long item to indexdb: len(item)=%d; it shouldn't exceed %d bytes; item prefix=%q", len(item), maxInmemoryBlockSize, itemPrefix)
 	}
 	ris.ibs = ibs
 	ris.mu.Unlock()
@@ -290,6 +291,8 @@ func (ris *rawItemsShard) updateFlushDeadline() {
 
 var tooLongItemLogger = logger.WithThrottler("tooLongItem", 5*time.Second)
 
+var tooLongItemsTotal atomic.Uint64
+
 type partWrapper struct {
 	// refCount is the number of references to partWrapper
 	refCount atomic.Int32
@@ -575,6 +578,8 @@ type TableMetrics struct {
 	IndexBlocksCacheMisses       uint64
 
 	PartsRefCount uint64
+
+	TooLongItemsDroppedTotal uint64
 }
 
 // TotalItemsCount returns the total number of items in the table.
@@ -632,6 +637,8 @@ func (tb *Table) UpdateMetrics(m *TableMetrics) {
 	m.IndexBlocksCacheSizeMaxBytes = uint64(idxbCache.SizeMaxBytes())
 	m.IndexBlocksCacheRequests = idxbCache.Requests()
 	m.IndexBlocksCacheMisses = idxbCache.Misses()
+
+	m.TooLongItemsDroppedTotal += tooLongItemsTotal.Load()
 }
 
 // AddItems adds the given items to the tb.

From 49f13b12d99bc2cfcbaa8d278c3d22f3fd4ba3ec Mon Sep 17 00:00:00 2001
From: hagen1778 <roman@victoriametrics.com>
Date: Fri, 24 May 2024 14:59:27 +0200
Subject: [PATCH 2/4] deployment/alerts: rm `ProcessNearFDLimits` alert from
 alerts-cluster

As it is already present in alerts-health file

Signed-off-by: hagen1778 <roman@victoriametrics.com>
---
 deployment/docker/alerts-cluster.yml | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/deployment/docker/alerts-cluster.yml b/deployment/docker/alerts-cluster.yml
index c4d092990..30e2eb50d 100644
--- a/deployment/docker/alerts-cluster.yml
+++ b/deployment/docker/alerts-cluster.yml
@@ -140,17 +140,6 @@ groups:
             for the current load. It is likely more RAM is needed for optimal handling of the current number of active time series.
             See also https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3976#issuecomment-1476883183"
 
-      - alert: ProcessNearFDLimits
-        expr: (process_max_fds - process_open_fds) < 100
-        for: 5m
-        labels:
-          severity: critical
-        annotations:
-          dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=117&var-instance={{ $labels.instance }}"
-          summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m"
-          description: "Exhausting OS file descriptors limit can cause severe degradation of the process.
-          Consider to increase the limit as fast as possible."
-
       - alert: LabelsLimitExceededOnIngestion
         expr: increase(vm_metrics_with_dropped_labels_total[5m]) > 0
         for: 15m

From 1be1e9a7a401bbe0c89249d9f425ff1e9b737da0 Mon Sep 17 00:00:00 2001
From: hagen1778 <roman@victoriametrics.com>
Date: Fri, 24 May 2024 15:09:52 +0200
Subject: [PATCH 3/4] deployment/alerts:  add new alerting rules
 `TooLongLabelValues` and `TooLongLabelNames` to notify about truncation of
 label values or names respectively.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
---
 deployment/docker/alerts-health.yml | 22 +++++++++++++++++++++-
 docs/CHANGELOG.md                   |  1 +
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/deployment/docker/alerts-health.yml b/deployment/docker/alerts-health.yml
index a3f2ea4a8..139860fea 100644
--- a/deployment/docker/alerts-health.yml
+++ b/deployment/docker/alerts-health.yml
@@ -97,4 +97,24 @@ groups:
           description: "VictoriaMetrics could skip registering new timeseries during ingestion if they fail the validation process. 
           For example, `reason=too_long_item` means that time series cannot exceed 64KB. Please, reduce the number 
           of labels or label values for such series. Or enforce these limits via `-maxLabelsPerTimeseries` and 
-          `-maxLabelValueLen` command-line flags."
\ No newline at end of file
+          `-maxLabelValueLen` command-line flags."
+
+      - alert: TooLongLabelValues
+        expr: increase(vm_too_long_label_values_total[5m]) > 0
+        labels:
+          severity: critical
+        annotations:
+          summary: "VictoriaMetrics truncates too long label values"
+          description: "The maximum length of a label value is limited via `-maxLabelValueLen` cmd-line flag. 
+           Longer label values are truncated and may result in time series overlapping.
+           Please, check your logs to find which labels were truncated and
+           either reduce the size of label values or increase `-maxLabelValueLen`."
+
+      - alert: TooLongLabelNames
+        expr: increase(vm_too_long_label_names_total[5m]) > 0
+        labels:
+          severity: critical
+        annotations:
+          summary: "VictoriaMetrics truncates too long label names"
+          description: "The maximum length of a label name is limited by 256 bytes. 
+           Longer label names are truncated and may result in time series overlapping."
\ No newline at end of file
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index e109bb5a2..1be84519d 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -54,6 +54,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
 * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert/): support reading [DNS SRV](https://en.wikipedia.org/wiki/SRV_record) records in `-datasource.url`, `-remoteWrite.url` and `-remoteRead.url` command-line option. For example, `-remoteWrite.url=http://srv+victoria-metrics` automatically resolves the `victoria-metrics` DNS SRV to a list of hostnames with TCP ports and then sends data to one of the addresses. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6053).
 * FEATURE: [vmbackup](https://docs.victoriametrics.com/vmbackup/), [vmrestore](https://docs.victoriametrics.com/vmrestore/), [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager/): add `-s3TLSInsecureSkipVerify` command-line flag for skipping TLS certificates verification when connecting to S3 endpoint.
 * FEATURE: expose metric `vm_indexdb_items_dropped_total` to track the number of IndexDB records that had to be dropped during ingestion. The reason of dropping the record will be annotated in `reason` label of the exposed metric. This change also comes with a new [alerting rule](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-health.yml) to track changes of this metric.
+* FEATURE: [alerts-health](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-health.yml): add new alerting rules `TooLongLabelValues` and `TooLongLabelNames` to notify about truncation of label values or names respectively. 
 
 * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix bug that prevents the first query trace from expanding on click event. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6186). The issue was introduced in [v1.100.0](https://docs.victoriametrics.com/changelog/#v11000) release.
 * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix calendar display when `UTC+00:00` timezone is set. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6239).

From 7a5000656eb572e6a9a0470e39ac2c8f2d122e20 Mon Sep 17 00:00:00 2001
From: Zakhar Bessarab <z.bessarab@victoriametrics.com>
Date: Fri, 24 May 2024 15:58:54 +0200
Subject: [PATCH 4/4] app/vmselect: update flag description (#6347)

Update wording to highlight that the cache is not persisted if the flag
value is empty. Previously, it was not clear whether the cache is not used
at all or just not persisted.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
---
 docs/Cluster-VictoriaMetrics.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/Cluster-VictoriaMetrics.md b/docs/Cluster-VictoriaMetrics.md
index 8e1816367..64a45c104 100644
--- a/docs/Cluster-VictoriaMetrics.md
+++ b/docs/Cluster-VictoriaMetrics.md
@@ -1307,7 +1307,7 @@ Below is the output for `/path/to/vmselect -help`:
   -blockcache.missesBeforeCaching int
      The number of cache misses before putting the block into cache. Higher values may reduce indexdb/dataBlocks cache size at the cost of higher CPU and disk read usage (default 2)
   -cacheDataPath string
-     Path to directory for cache files. Cache isn't saved if empty
+     Path to directory for cache files. By default, the cache is not persisted.
   -cacheExpireDuration duration
      Items are removed from in-memory caches after they aren't accessed for this duration. Lower values may reduce memory usage at the cost of higher CPU usage. See also -prevCacheRemovalPercent (default 30m0s)
   -cluster.tls