mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2025-01-10 15:14:09 +00:00
Introduce a flag for limiting the number of time series to delete (cluster version) (#7112)
### Describe Your Changes Introduce the `-search.maxDeleteSeries` flag that limits the number of time series that can be deleted with a single `/api/v1/admin/tsdb/delete_series` call. Currently, any number can be deleted and if the number is big (millions) then the operation may result in unaccounted CPU and memory usage spikes which in some cases may result in OOM kill (see #7027). The flag limits the number to 30k by default and the users may override it if needed at the vmstorage start time. Related issue: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7027 --------- Signed-off-by: Artem Fetishev <rtm@victoriametrics.com>
This commit is contained in:
parent
bf746f9975
commit
91c2b5b24d
6 changed files with 144 additions and 16 deletions
|
@ -3,7 +3,6 @@ package prometheus
|
|||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"github.com/VictoriaMetrics/metricsql"
|
||||
"math"
|
||||
"net"
|
||||
"net/http"
|
||||
|
@ -30,6 +29,7 @@ import (
|
|||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/querytracer"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
"github.com/VictoriaMetrics/metricsql"
|
||||
"github.com/valyala/fastjson/fastfloat"
|
||||
)
|
||||
|
||||
|
@ -56,6 +56,7 @@ var (
|
|||
maxExportSeries = flag.Int("search.maxExportSeries", 10e6, "The maximum number of time series, which can be returned from /api/v1/export* APIs. This option allows limiting memory usage")
|
||||
maxTSDBStatusSeries = flag.Int("search.maxTSDBStatusSeries", 10e6, "The maximum number of time series, which can be processed during the call to /api/v1/status/tsdb. This option allows limiting memory usage")
|
||||
maxSeriesLimit = flag.Int("search.maxSeries", 30e3, "The maximum number of time series, which can be returned from /api/v1/series. This option allows limiting memory usage")
|
||||
maxDeleteSeries = flag.Int("search.maxDeleteSeries", 1e6, "The maximum number of time series, which can be deleted using /api/v1/admin/tsdb/delete_series. This option allows limiting memory usage")
|
||||
maxLabelsAPISeries = flag.Int("search.maxLabelsAPISeries", 1e6, "The maximum number of time series, which could be scanned when searching for the matching time series "+
|
||||
"at /api/v1/labels and /api/v1/label/.../values. This option allows limiting memory usage and CPU usage. See also -search.maxLabelsAPIDuration, "+
|
||||
"-search.maxTagKeys, -search.maxTagValues and -search.ignoreExtraFiltersAtLabelsAPI")
|
||||
|
@ -493,7 +494,7 @@ func DeleteHandler(startTime time.Time, at *auth.Token, r *http.Request) error {
|
|||
if !cp.IsDefaultTimeRange() {
|
||||
return fmt.Errorf("start=%d and end=%d args aren't supported. Remove these args from the query in order to delete all the matching metrics", cp.start, cp.end)
|
||||
}
|
||||
sq := storage.NewSearchQuery(at.AccountID, at.ProjectID, cp.start, cp.end, cp.filterss, 0)
|
||||
sq := storage.NewSearchQuery(at.AccountID, at.ProjectID, cp.start, cp.end, cp.filterss, *maxDeleteSeries)
|
||||
deletedCount, err := netstorage.DeleteSeries(nil, sq, cp.deadline)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot delete time series: %w", err)
|
||||
|
|
|
@ -154,7 +154,7 @@ func (api *vmstorageAPI) DeleteSeries(qt *querytracer.Tracer, sq *storage.Search
|
|||
if len(tfss) == 0 {
|
||||
return 0, fmt.Errorf("missing tag filters")
|
||||
}
|
||||
return api.s.DeleteSeries(qt, tfss)
|
||||
return api.s.DeleteSeries(qt, tfss, maxMetrics)
|
||||
}
|
||||
|
||||
func (api *vmstorageAPI) RegisterMetricNames(qt *querytracer.Tracer, mrs []storage.MetricRow, _ uint64) error {
|
||||
|
|
|
@ -706,6 +706,13 @@ Some workloads may need fine-grained resource usage limits. In these cases the f
|
|||
Queries to this endpoint may take big amounts of CPU time and memory at `vmstorage` and `vmselect` when the database contains
|
||||
big number of unique time series because of [high churn rate](https://docs.victoriametrics.com/faq/#what-is-high-churn-rate).
|
||||
In this case it might be useful to set the `-search.maxSeries` to quite low value in order limit CPU and memory usage.
|
||||
- `-search.maxDeleteSeries` at `vmselect` limits the number of unique time
|
||||
series that can be deleted by a single
|
||||
[/api/v1/admin/tsdb/delete_series](https://docs.victoriametrics.com/url-examples/#apiv1admintsdbdelete_series)
|
||||
call. Deleting too many time series may require big amount of CPU and memory
|
||||
at `vmstorage` and this limit guards against unplanned resource usage spikes.
|
||||
Also see [How to delete time series](#how-to-delete-time-series) section to
|
||||
learn about different ways of deleting series.
|
||||
- `-search.maxTagKeys` at `vmstorage` limits the number of items, which may be returned from
|
||||
[/api/v1/labels](https://docs.victoriametrics.com/url-examples/#apiv1labels). This endpoint is used mostly by Grafana
|
||||
for auto-completion of label names. Queries to this endpoint may take big amounts of CPU time and memory at `vmstorage` and `vmselect`
|
||||
|
|
|
@ -1716,12 +1716,14 @@ func (db *indexDB) searchMetricNameWithCache(dst []byte, metricID uint64, accoun
|
|||
return dst, false
|
||||
}
|
||||
|
||||
// DeleteTSIDs marks as deleted all the TSIDs matching the given tfss.
|
||||
// DeleteTSIDs marks as deleted all the TSIDs matching the given tfss and
|
||||
// updates or resets all caches where TSIDs and the corresponding MetricIDs may
|
||||
// be stored.
|
||||
//
|
||||
// The caller must reset all the caches which may contain the deleted TSIDs.
|
||||
//
|
||||
// Returns the number of metrics deleted.
|
||||
func (db *indexDB) DeleteTSIDs(qt *querytracer.Tracer, tfss []*TagFilters) (int, error) {
|
||||
// If the number of the series exceeds maxMetrics, no series will be deleted and
|
||||
// an error will be returned. Otherwise, the funciton returns the number of
|
||||
// series deleted.
|
||||
func (db *indexDB) DeleteTSIDs(qt *querytracer.Tracer, tfss []*TagFilters, maxMetrics int) (int, error) {
|
||||
qt = qt.NewChild("deleting series for %s", tfss)
|
||||
defer qt.Done()
|
||||
if len(tfss) == 0 {
|
||||
|
@ -1734,7 +1736,7 @@ func (db *indexDB) DeleteTSIDs(qt *querytracer.Tracer, tfss []*TagFilters) (int,
|
|||
MaxTimestamp: (1 << 63) - 1,
|
||||
}
|
||||
is := db.getIndexSearch(tfss[0].accountID, tfss[0].projectID, noDeadline)
|
||||
metricIDs, err := is.searchMetricIDs(qt, tfss, tr, 2e9)
|
||||
metricIDs, err := is.searchMetricIDs(qt, tfss, tr, maxMetrics)
|
||||
db.putIndexSearch(is)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
|
@ -1746,7 +1748,7 @@ func (db *indexDB) DeleteTSIDs(qt *querytracer.Tracer, tfss []*TagFilters) (int,
|
|||
db.doExtDB(func(extDB *indexDB) {
|
||||
var n int
|
||||
qtChild := qt.NewChild("deleting series from the previos indexdb")
|
||||
n, err = extDB.DeleteTSIDs(qtChild, tfss)
|
||||
n, err = extDB.DeleteTSIDs(qtChild, tfss, maxMetrics)
|
||||
qtChild.Donef("deleted %d series", n)
|
||||
deletedCount += n
|
||||
})
|
||||
|
|
|
@ -1335,11 +1335,13 @@ func (s *Storage) prefetchMetricNames(qt *querytracer.Tracer, accountID, project
|
|||
// ErrDeadlineExceeded is returned when the request times out.
|
||||
var ErrDeadlineExceeded = fmt.Errorf("deadline exceeded")
|
||||
|
||||
// DeleteSeries deletes all the series matching the given tfss.
|
||||
// DeleteSeries deletes the series matching the given tfss.
|
||||
//
|
||||
// Returns the number of metrics deleted.
|
||||
func (s *Storage) DeleteSeries(qt *querytracer.Tracer, tfss []*TagFilters) (int, error) {
|
||||
deletedCount, err := s.idb().DeleteTSIDs(qt, tfss)
|
||||
// If the number of the series exceeds maxMetrics, no series will be deleted and
|
||||
// an error will be returned. Otherwise, the funciton returns the number of
|
||||
// metrics deleted.
|
||||
func (s *Storage) DeleteSeries(qt *querytracer.Tracer, tfss []*TagFilters, maxMetrics int) (int, error) {
|
||||
deletedCount, err := s.idb().DeleteTSIDs(qt, tfss, maxMetrics)
|
||||
if err != nil {
|
||||
return deletedCount, fmt.Errorf("cannot delete tsids: %w", err)
|
||||
}
|
||||
|
|
|
@ -824,7 +824,7 @@ func testStorageDeleteSeries(s *Storage, workerNum int) error {
|
|||
if n := metricBlocksCount(tfs); n == 0 {
|
||||
return fmt.Errorf("expecting non-zero number of metric blocks for tfs=%s", tfs)
|
||||
}
|
||||
deletedCount, err := s.DeleteSeries(nil, []*TagFilters{tfs})
|
||||
deletedCount, err := s.DeleteSeries(nil, []*TagFilters{tfs}, 1e9)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot delete metrics: %w", err)
|
||||
}
|
||||
|
@ -836,7 +836,7 @@ func testStorageDeleteSeries(s *Storage, workerNum int) error {
|
|||
}
|
||||
|
||||
// Try deleting empty tfss
|
||||
deletedCount, err = s.DeleteSeries(nil, nil)
|
||||
deletedCount, err = s.DeleteSeries(nil, nil, 1e9)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot delete empty tfss: %w", err)
|
||||
}
|
||||
|
@ -884,6 +884,122 @@ func checkLabelNames(lns []string, lnsExpected map[string]bool) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func TestStorageDeleteSeries_TooManyTimeseries(t *testing.T) {
|
||||
defer testRemoveAll(t)
|
||||
|
||||
const numSeries = 1000
|
||||
rng := rand.New(rand.NewSource(1))
|
||||
var accountID uint32 = 1
|
||||
var projectID uint32 = 2
|
||||
mrs := testGenerateMetricRowsWithPrefixForTenantID(rng, accountID, projectID, numSeries, "metric", TimeRange{
|
||||
MinTimestamp: time.Now().Add(-100 * 24 * time.Hour).UnixMilli(),
|
||||
MaxTimestamp: time.Now().UnixMilli(),
|
||||
})
|
||||
|
||||
s := MustOpenStorage(t.Name(), 0, 0, 0)
|
||||
defer s.MustClose()
|
||||
s.AddRows(mrs, defaultPrecisionBits)
|
||||
s.DebugFlush()
|
||||
|
||||
tfs := NewTagFilters(accountID, projectID)
|
||||
if err := tfs.Add(nil, []byte("metric.*"), false, true); err != nil {
|
||||
t.Fatalf("unexpected error in TagFilters.Add: %v", err)
|
||||
}
|
||||
maxSeries := numSeries - 1
|
||||
count, err := s.DeleteSeries(nil, []*TagFilters{tfs}, maxSeries)
|
||||
if err == nil {
|
||||
t.Errorf("expected an error but there hasn't been one")
|
||||
}
|
||||
if count != 0 {
|
||||
t.Errorf("unexpected deleted series count: got %d, want 0", count)
|
||||
}
|
||||
}
|
||||
|
||||
func TestStorageDeleteSeries_CachesAreUpdatedOrReset(t *testing.T) {
|
||||
defer testRemoveAll(t)
|
||||
|
||||
tr := TimeRange{
|
||||
MinTimestamp: time.Now().Add(-100 * 24 * time.Hour).UnixMilli(),
|
||||
MaxTimestamp: time.Now().UnixMilli(),
|
||||
}
|
||||
mn := MetricName{MetricGroup: []byte("metric")}
|
||||
mr := MetricRow{
|
||||
MetricNameRaw: mn.marshalRaw(nil),
|
||||
Timestamp: tr.MaxTimestamp,
|
||||
Value: 123,
|
||||
}
|
||||
var (
|
||||
genTSID generationTSID
|
||||
tfssKey []byte
|
||||
)
|
||||
tfs := NewTagFilters(0, 0)
|
||||
if err := tfs.Add(nil, []byte("metric.*"), false, true); err != nil {
|
||||
t.Fatalf("unexpected error in TagFilters.Add: %v", err)
|
||||
}
|
||||
tfss := []*TagFilters{tfs}
|
||||
s := MustOpenStorage(t.Name(), 0, 0, 0)
|
||||
defer s.MustClose()
|
||||
|
||||
// Ensure caches are empty.
|
||||
if s.getTSIDFromCache(&genTSID, mr.MetricNameRaw) {
|
||||
t.Fatalf("tsidCache unexpected contents: got %v, want empty", genTSID)
|
||||
}
|
||||
tfssKey = marshalTagFiltersKey(nil, tfss, tr, true)
|
||||
if got, ok := s.idb().getMetricIDsFromTagFiltersCache(nil, tfssKey); ok {
|
||||
t.Fatalf("tagFiltersToMetricIDsCache unexpected contents: got %v, want empty", got)
|
||||
}
|
||||
if got := s.getDeletedMetricIDs().Len(); got != 0 {
|
||||
t.Fatalf("deletedMetricIDs cache: unexpected size: got %d, want empty", got)
|
||||
}
|
||||
|
||||
// Add one row, search it, and ensure that the tsidCache and
|
||||
// tagFiltersToMetricIDsCache are not empty but the deletedMetricIDs
|
||||
// cache is still empty.
|
||||
s.AddRows([]MetricRow{mr}, defaultPrecisionBits)
|
||||
s.DebugFlush()
|
||||
gotMetrics, err := s.SearchMetricNames(nil, tfss, tr, 1, noDeadline)
|
||||
if err != nil {
|
||||
t.Fatalf("SearchMetricNames() failed unexpectedly: %v", err)
|
||||
}
|
||||
wantMetrics := []string{string(mr.MetricNameRaw)}
|
||||
if reflect.DeepEqual(gotMetrics, wantMetrics) {
|
||||
t.Fatalf("SearchMetricNames() unexpected search result: got %v, want %v", gotMetrics, wantMetrics)
|
||||
}
|
||||
|
||||
if !s.getTSIDFromCache(&genTSID, mr.MetricNameRaw) {
|
||||
t.Fatalf("tsidCache was expected to contain a record but it did not")
|
||||
}
|
||||
metricID := genTSID.TSID.MetricID
|
||||
tfssKey = marshalTagFiltersKey(nil, tfss, tr, true)
|
||||
if _, ok := s.idb().getMetricIDsFromTagFiltersCache(nil, tfssKey); !ok {
|
||||
t.Fatalf("tagFiltersToMetricIDsCache was expected to contain a record but it did not")
|
||||
}
|
||||
if got := s.getDeletedMetricIDs().Len(); got != 0 {
|
||||
t.Fatalf("deletedMetricIDs cache unexpected size: got %d, want empty", got)
|
||||
}
|
||||
|
||||
// Delete the metric added earlier and ensure that the tsidCache and
|
||||
// tagFiltersToMetricIDsCache have been reset and the deletedMetricIDs
|
||||
// cache is now contains ID of the deleted metric.
|
||||
numDeletedSeries, err := s.DeleteSeries(nil, tfss, 1)
|
||||
if err != nil {
|
||||
t.Fatalf("DeleteSeries() failed unexpectedly: %v", err)
|
||||
}
|
||||
if got, want := numDeletedSeries, 1; got != want {
|
||||
t.Fatalf("unexpected number of deleted series, got %d, want %d", got, want)
|
||||
}
|
||||
if s.getTSIDFromCache(&genTSID, mr.MetricNameRaw) {
|
||||
t.Fatalf("tsidCache unexpected contents: got %v, want empty", genTSID)
|
||||
}
|
||||
tfssKey = marshalTagFiltersKey(nil, tfss, tr, true)
|
||||
if got, ok := s.idb().getMetricIDsFromTagFiltersCache(nil, tfssKey); ok {
|
||||
t.Fatalf("tagFiltersToMetricIDsCache unexpected contents: got %v, want empty", got)
|
||||
}
|
||||
if got, want := s.getDeletedMetricIDs().AppendTo(nil), []uint64{metricID}; !reflect.DeepEqual(got, want) {
|
||||
t.Fatalf("deletedMetricIDs cache: unexpected contents: got %v, want %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestStorageRegisterMetricNamesSerial(t *testing.T) {
|
||||
path := "TestStorageRegisterMetricNamesSerial"
|
||||
s := MustOpenStorage(path, 0, 0, 0)
|
||||
|
|
Loading…
Reference in a new issue