mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2025-01-20 15:16:42 +00:00
Merge branch 'public-single-node' into victorialogs-wip
This commit is contained in:
commit
07d244dab0
7 changed files with 48 additions and 18 deletions
|
@ -195,7 +195,8 @@ func SearchLabelNamesWithFiltersOnTimeRange(qt *querytracer.Tracer, tfss []*stor
|
|||
|
||||
// SearchLabelValuesWithFiltersOnTimeRange searches for label values for the given labelName, tfss and tr.
|
||||
func SearchLabelValuesWithFiltersOnTimeRange(qt *querytracer.Tracer, labelName string, tfss []*storage.TagFilters,
|
||||
tr storage.TimeRange, maxLabelValues, maxMetrics int, deadline uint64) ([]string, error) {
|
||||
tr storage.TimeRange, maxLabelValues, maxMetrics int, deadline uint64,
|
||||
) ([]string, error) {
|
||||
WG.Add(1)
|
||||
labelValues, err := Storage.SearchLabelValuesWithFiltersOnTimeRange(qt, labelName, tfss, tr, maxLabelValues, maxMetrics, deadline)
|
||||
WG.Done()
|
||||
|
@ -492,6 +493,7 @@ func writeStorageMetrics(w io.Writer, strg *storage.Storage) {
|
|||
|
||||
metrics.WriteCounterUint64(w, `vm_indexdb_items_added_total`, idbm.ItemsAdded)
|
||||
metrics.WriteCounterUint64(w, `vm_indexdb_items_added_size_bytes_total`, idbm.ItemsAddedSizeBytes)
|
||||
metrics.WriteCounterUint64(w, `vm_indexdb_items_dropped_total{reason="too_long_item"}`, idbm.TooLongItemsDroppedTotal)
|
||||
|
||||
metrics.WriteGaugeUint64(w, `vm_pending_rows{type="storage"}`, tm.PendingRows)
|
||||
metrics.WriteGaugeUint64(w, `vm_pending_rows{type="indexdb"}`, idbm.PendingItems)
|
||||
|
|
|
@ -140,17 +140,6 @@ groups:
|
|||
for the current load. It is likely more RAM is needed for optimal handling of the current number of active time series.
|
||||
See also https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3976#issuecomment-1476883183"
|
||||
|
||||
- alert: ProcessNearFDLimits
|
||||
expr: (process_max_fds - process_open_fds) < 100
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=117&var-instance={{ $labels.instance }}"
|
||||
summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m"
|
||||
description: "Exhausting OS file descriptors limit can cause severe degradation of the process.
|
||||
Consider to increase the limit as fast as possible."
|
||||
|
||||
- alert: LabelsLimitExceededOnIngestion
|
||||
expr: increase(vm_metrics_with_dropped_labels_total[5m]) > 0
|
||||
for: 15m
|
||||
|
@ -176,4 +165,3 @@ groups:
|
|||
is saturated by more than 90% and vminsert won't be able to keep up.\n
|
||||
This usually means that more vminsert or vmstorage nodes must be added to the cluster in order to increase
|
||||
the total number of vminsert -> vmstorage links."
|
||||
|
||||
|
|
|
@ -87,3 +87,34 @@ groups:
|
|||
In some cases for components like vmagent or vminsert the alert might trigger if there are too many clients
|
||||
making write attempts. If vmagent's or vminsert's CPU usage and network saturation are at normal level, then
|
||||
it might be worth adjusting `-maxConcurrentInserts` cmd-line flag."
|
||||
|
||||
- alert: IndexDBRecordsDrop
|
||||
expr: increase(vm_indexdb_items_dropped_total[5m]) > 0
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "IndexDB skipped registering items during data ingestion with reason={{ $labels.reason }}."
|
||||
description: "VictoriaMetrics could skip registering new timeseries during ingestion if they fail the validation process.
|
||||
For example, `reason=too_long_item` means that time series cannot exceed 64KB. Please, reduce the number
|
||||
of labels or label values for such series. Or enforce these limits via `-maxLabelsPerTimeseries` and
|
||||
`-maxLabelValueLen` command-line flags."
|
||||
|
||||
- alert: TooLongLabelValues
|
||||
expr: increase(vm_too_long_label_values_total[5m]) > 0
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "VictoriaMetrics truncates too long label values"
|
||||
description: "The maximum length of a label value is limited via `-maxLabelValueLen` cmd-line flag.
|
||||
Longer label values are truncated and may result into time series overlapping.
|
||||
Please, check your logs to find which labels were truncated and
|
||||
either reduce the size of label values or increase `-maxLabelValueLen`".
|
||||
|
||||
- alert: TooLongLabelNames
|
||||
expr: increase(vm_too_long_label_names_total[5m]) > 0
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "VictoriaMetrics truncates too long label names"
|
||||
description: "The maximum length of a label name is limited by 256 bytes.
|
||||
Longer label names are truncated and may result into time series overlapping.".
|
|
@ -43,6 +43,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
|
|||
* FEATURE: [dashboards/single](https://grafana.com/grafana/dashboards/10229): add `Network Usage` panel to `Resource Usage` row.
|
||||
* FEATURE: [dashboards/operator](https://grafana.com/grafana/dashboards/17869), [dashboards/backupmanager](https://grafana.com/grafana/dashboards/17798) and [dashboard/tenant-statistic](https://grafana.com/grafana/dashboards/16399): update dashboard to be compatible with Grafana 10+ version.
|
||||
* FEATURE: [dashboards/cluster](https://grafana.com/grafana/dashboards/11176): add new panel `Concurrent selects` to `vmstorage` row. The panel will show how many ongoing select queries are processed by vmstorage and should help to identify resource bottlenecks. See panel description for more details.
|
||||
* FEATURE: [dashboards](https://grafana.com/orgs/victoriametrics): use `$__interval` variable for offsets and look-behind windows in annotations. This should improve precision of `restarts` and `version change` annotations when zooming-in/zooming-out on the dashboards.
|
||||
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): support aggregation and deduplication configs before replicating data to configured `-remoteWrite.url` destinations. This saves CPU and memory resources when incoming data needs to be aggregated or deduplicated once and then replicated to multiple destinations. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5467).
|
||||
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add service discovery support for [Vultr](https://www.vultr.com/). See [these docs](https://docs.victoriametrics.com/sd_configs/#vultr_sd_configs) and [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6041).
|
||||
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): allow configuring `-remoteWrite.disableOnDiskQueue` and `-remoteWrite.dropSamplesOnOverload` cmd-line flags per each `-remoteWrite.url`. See this [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6065). Thanks to @rbizos for implementaion!
|
||||
|
@ -52,7 +53,8 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
|
|||
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert/): speed up retrieving rules files from object storages by skipping unchanged objects during reloading. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6210).
|
||||
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert/): support reading [DNS SRV](https://en.wikipedia.org/wiki/SRV_record) records in `-datasource.url`, `-remoteWrite.url` and `-remoteRead.url` command-line option. For example, `-remoteWrite.url=http://srv+victoria-metrics` automatically resolves the `victoria-metrics` DNS SRV to a list of hostnames with TCP ports and then sends data to one of the addresses. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6053).
|
||||
* FEATURE: [vmbackup](https://docs.victoriametrics.com/vmbackup/), [vmrestore](https://docs.victoriametrics.com/vmrestore/), [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager/): add `-s3TLSInsecureSkipVerify` command-line flag for skipping TLS certificates verification when connecting to S3 endpoint.
|
||||
* FEATURE: [dashboards](https://grafana.com/orgs/victoriametrics): use `$__interval` variable for offsets and look-behind windows in annotations. This should improve precision of `restarts` and `version change` annotations when zooming-in/zooming-out on the dashboards.
|
||||
* FEATURE: expose metric `vm_indexdb_items_dropped_total` to track the number of IndexDB records that had to be dropped during ingestion. The reason of dropping the record will be annotated in `reason` label of the exposed metric. This change also comes with a new [alerting rule](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-health.yml) to track changes of this metric.
|
||||
* FEATURE: [alerts-health](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-health.yml): add new alerting rules `TooLongLabelValues` and `TooLongLabelNames` to notify about truncation of label values or names respectively.
|
||||
|
||||
* BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix bug that prevents the first query trace from expanding on click event. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6186). The issue was introduced in [v1.100.0](https://docs.victoriametrics.com/changelog/#v11000) release.
|
||||
* BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix calendar display when `UTC+00:00` timezone is set. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6239).
|
||||
|
|
|
@ -1307,7 +1307,7 @@ Below is the output for `/path/to/vmselect -help`:
|
|||
-blockcache.missesBeforeCaching int
|
||||
The number of cache misses before putting the block into cache. Higher values may reduce indexdb/dataBlocks cache size at the cost of higher CPU and disk read usage (default 2)
|
||||
-cacheDataPath string
|
||||
Path to directory for cache files. Cache isn't saved if empty
|
||||
Path to directory for cache files. By default, the cache is not persisted.
|
||||
-cacheExpireDuration duration
|
||||
Items are removed from in-memory caches after they aren't accessed for this duration. Lower values may reduce memory usage at the cost of higher CPU usage. See also -prevCacheRemovalPercent (default 30m0s)
|
||||
-cluster.tls
|
||||
|
|
|
@ -276,7 +276,8 @@ func (ris *rawItemsShard) addItems(items [][]byte) ([][]byte, []*inmemoryBlock)
|
|||
if len(itemPrefix) > 128 {
|
||||
itemPrefix = itemPrefix[:128]
|
||||
}
|
||||
tooLongItemLogger.Errorf("skipping adding too long item to indexdb: len(item)=%d; it souldn't exceed %d bytes; item prefix=%q", len(item), maxInmemoryBlockSize, itemPrefix)
|
||||
tooLongItemsTotal.Add(1)
|
||||
tooLongItemLogger.Errorf("skipping adding too long item to indexdb: len(item)=%d; it shouldn't exceed %d bytes; item prefix=%q", len(item), maxInmemoryBlockSize, itemPrefix)
|
||||
}
|
||||
ris.ibs = ibs
|
||||
ris.mu.Unlock()
|
||||
|
@ -290,6 +291,8 @@ func (ris *rawItemsShard) updateFlushDeadline() {
|
|||
|
||||
var tooLongItemLogger = logger.WithThrottler("tooLongItem", 5*time.Second)
|
||||
|
||||
var tooLongItemsTotal atomic.Uint64
|
||||
|
||||
type partWrapper struct {
|
||||
// refCount is the number of references to partWrapper
|
||||
refCount atomic.Int32
|
||||
|
@ -575,6 +578,8 @@ type TableMetrics struct {
|
|||
IndexBlocksCacheMisses uint64
|
||||
|
||||
PartsRefCount uint64
|
||||
|
||||
TooLongItemsDroppedTotal uint64
|
||||
}
|
||||
|
||||
// TotalItemsCount returns the total number of items in the table.
|
||||
|
@ -632,6 +637,8 @@ func (tb *Table) UpdateMetrics(m *TableMetrics) {
|
|||
m.IndexBlocksCacheSizeMaxBytes = uint64(idxbCache.SizeMaxBytes())
|
||||
m.IndexBlocksCacheRequests = idxbCache.Requests()
|
||||
m.IndexBlocksCacheMisses = idxbCache.Misses()
|
||||
|
||||
m.TooLongItemsDroppedTotal += tooLongItemsTotal.Load()
|
||||
}
|
||||
|
||||
// AddItems adds the given items to the tb.
|
||||
|
|
Loading…
Reference in a new issue