diff --git a/docs/changelog/CHANGELOG.md b/docs/changelog/CHANGELOG.md index fa9e588fc..10f989cbd 100644 --- a/docs/changelog/CHANGELOG.md +++ b/docs/changelog/CHANGELOG.md @@ -46,6 +46,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/). * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert/), [vmctl](https://docs.victoriametrics.com/vmctl/) and snapshot API: verify correctness of URLs provided via cmd-line flags before executing HTTP requests. See [this](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6740) issue for details. * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert): reduce memory usage when parsing responses with big number of metrics in response. The memory usage was increased in [v1.102.0-rc1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.102.0-rc1) after attempt to reduce CPU usage for heavy loaded vmalerts. * BUGFIX: all VictoriaMetrics components: forcefully set owner/group for release tars to 1000:1000. This helps to avoid unpacking [issues](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6788) on systems with limitations around UID:GID configuration. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6846). +* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): Removes the fallback to global index search when the search using per-day index fails due to too many time series found (the global index will fail anyway with the same error and so the fallback is not needed and only slows down the search). See [this](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6836) for details. 
* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): fix metric names registering in the per-day index for new dates for existing time series when making calls to `/tags/tagSeries` and `/tags/tagMultiSeries` handlers of [Graphite API](https://docs.victoriametrics.com/#graphite-api-usage). See [this](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6872/) for details. ## [v1.102.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.102.1) diff --git a/lib/storage/index_db.go b/lib/storage/index_db.go index e2e9bcc07..2d1f57225 100644 --- a/lib/storage/index_db.go +++ b/lib/storage/index_db.go @@ -2365,6 +2365,12 @@ func (is *indexSearch) searchMetricIDs(qt *querytracer.Tracer, tfss []*TagFilter return sortedMetricIDs, nil } +func errTooManyTimeseries(maxMetrics int) error { + return fmt.Errorf("the number of matching timeseries exceeds %d; "+ + "either narrow down the search or increase -search.max* command-line flag values at vmselect; "+ + "see https://docs.victoriametrics.com/#resource-usage-limits", maxMetrics) +} + func (is *indexSearch) searchMetricIDsInternal(qt *querytracer.Tracer, tfss []*TagFilters, tr TimeRange, maxMetrics int) (*uint64set.Set, error) { qt = qt.NewChild("search for metric ids: filters=%s, timeRange=%s, maxMetrics=%d", tfss, &tr, maxMetrics) defer qt.Done() @@ -2401,32 +2407,30 @@ func (is *indexSearch) searchMetricIDsInternal(qt *querytracer.Tracer, tfss []*T return nil, err } if metricIDs.Len() > maxMetrics { - return nil, fmt.Errorf("the number of matching timeseries exceeds %d; either narrow down the search "+ - "or increase -search.max* command-line flag values at vmselect; see https://docs.victoriametrics.com/#resource-usage-limits", maxMetrics) + return nil, errTooManyTimeseries(maxMetrics) } } return metricIDs, nil } +const maxDaysForPerDaySearch = 40 + +func (is *indexSearch) 
updateMetricIDsForTagFilters(qt *querytracer.Tracer, metricIDs *uint64set.Set, tfs *TagFilters, tr TimeRange, maxMetrics int) error { - err := is.tryUpdatingMetricIDsForDateRange(qt, metricIDs, tfs, tr, maxMetrics) - if err == nil { - // Fast path: found metricIDs by date range. - return nil - } - if !errors.Is(err, errFallbackToGlobalSearch) { - return err + minDate := uint64(tr.MinTimestamp) / msecPerDay + maxDate := uint64(tr.MaxTimestamp-1) / msecPerDay + if minDate <= maxDate && maxDate-minDate <= maxDaysForPerDaySearch { + // Fast path - search metricIDs by date range in the per-day inverted + // index. + is.db.dateRangeSearchCalls.Add(1) + qt.Printf("search metric ids in the per-day index") + return is.updateMetricIDsForDateRange(qt, metricIDs, tfs, minDate, maxDate, maxMetrics) } - // Slow path - fall back to search in the global inverted index. - qt.Printf("cannot find metric ids in per-day index; fall back to global index") + // Slow path - search metricIDs in the global inverted index. 
+ qt.Printf("search metric ids in the global index") is.db.globalSearchCalls.Add(1) m, err := is.getMetricIDsForDateAndFilters(qt, 0, tfs, maxMetrics) if err != nil { - if errors.Is(err, errFallbackToGlobalSearch) { - return fmt.Errorf("the number of matching timeseries exceeds %d; either narrow down the search "+ - "or increase -search.max* command-line flag values at vmselect; see https://docs.victoriametrics.com/#resource-usage-limits", maxMetrics) - } return err } metricIDs.UnionMayOwn(m) @@ -2605,18 +2609,7 @@ func (is *indexSearch) updateMetricIDsForOrSuffix(prefix []byte, metricIDs *uint return loopsCount, nil } -var errFallbackToGlobalSearch = errors.New("fall back from per-day index search to global index search") - -const maxDaysForPerDaySearch = 40 - -func (is *indexSearch) tryUpdatingMetricIDsForDateRange(qt *querytracer.Tracer, metricIDs *uint64set.Set, tfs *TagFilters, tr TimeRange, maxMetrics int) error { - is.db.dateRangeSearchCalls.Add(1) - minDate := uint64(tr.MinTimestamp) / msecPerDay - maxDate := uint64(tr.MaxTimestamp-1) / msecPerDay - if minDate > maxDate || maxDate-minDate > maxDaysForPerDaySearch { - // Too much dates must be covered. Give up, since it may be slow. - return errFallbackToGlobalSearch - } +func (is *indexSearch) updateMetricIDsForDateRange(qt *querytracer.Tracer, metricIDs *uint64set.Set, tfs *TagFilters, minDate, maxDate uint64, maxMetrics int) error { if minDate == maxDate { // Fast path - query only a single date. m, err := is.getMetricIDsForDateAndFilters(qt, minDate, tfs, maxMetrics) @@ -2781,7 +2774,7 @@ func (is *indexSearch) getMetricIDsForDateAndFilters(qt *querytracer.Tracer, dat } if m.Len() >= maxDateMetrics { // Too many time series found for the given (date). Fall back to global search. 
- return nil, errFallbackToGlobalSearch + return nil, errTooManyTimeseries(maxDateMetrics) } metricIDs = m qt.Printf("found %d metric ids", metricIDs.Len()) diff --git a/lib/storage/storage_test.go b/lib/storage/storage_test.go index cd89823fa..94d100a5d 100644 --- a/lib/storage/storage_test.go +++ b/lib/storage/storage_test.go @@ -1162,6 +1162,35 @@ func testGenerateMetricRowsForTenant(accountID, projectID uint32, rng *rand.Rand } func testGenerateMetricRows(rng *rand.Rand, rows uint64, timestampMin, timestampMax int64) []MetricRow { + return testGenerateMetricRowsWithPrefix(rng, rows, "metric", TimeRange{timestampMin, timestampMax}) +} + +func testGenerateMetricRowsWithPrefixForTenantID(rng *rand.Rand, accountID, projectID uint32, rows uint64, prefix string, tr TimeRange) []MetricRow { + var mrs []MetricRow + var mn MetricName + mn.Tags = []Tag{ + {[]byte("job"), []byte("webservice")}, + {[]byte("instance"), []byte("1.2.3.4")}, + } + for i := 0; i < int(rows); i++ { + mn.AccountID = accountID + mn.ProjectID = projectID + mn.MetricGroup = []byte(fmt.Sprintf("%s_%d", prefix, i)) + metricNameRaw := mn.marshalRaw(nil) + timestamp := rng.Int63n(tr.MaxTimestamp-tr.MinTimestamp) + tr.MinTimestamp + value := rng.NormFloat64() * 1e6 + + mr := MetricRow{ + MetricNameRaw: metricNameRaw, + Timestamp: timestamp, + Value: value, + } + mrs = append(mrs, mr) + } + return mrs +} + +func testGenerateMetricRowsWithPrefix(rng *rand.Rand, rows uint64, prefix string, tr TimeRange) []MetricRow { var mrs []MetricRow var mn MetricName mn.Tags = []Tag{ @@ -1171,9 +1200,9 @@ func testGenerateMetricRows(rng *rand.Rand, rows uint64, timestampMin, timestamp for i := 0; i < int(rows); i++ { mn.AccountID = uint32(rand.Intn(2)) mn.ProjectID = uint32(rand.Intn(3)) - mn.MetricGroup = []byte(fmt.Sprintf("metric_%d", i)) + mn.MetricGroup = []byte(fmt.Sprintf("%s_%d", prefix, i)) metricNameRaw := mn.marshalRaw(nil) - timestamp := rng.Int63n(timestampMax-timestampMin) + timestampMin + timestamp := 
rng.Int63n(tr.MaxTimestamp-tr.MinTimestamp) + tr.MinTimestamp value := rng.NormFloat64() * 1e6 mr := MetricRow{ @@ -1629,6 +1658,223 @@ func testCountAllMetricNames(s *Storage, accountID, projectID uint32, tr TimeRan return len(names) } +func TestStorageSearchMetricNames_TooManyTimeseries(t *testing.T) { + defer testRemoveAll(t) + + const ( + numDays = 100 + numRows = 10 + ) + rng := rand.New(rand.NewSource(1)) + var ( + days []TimeRange + mrs []MetricRow + ) + for i := range numDays { + day := TimeRange{ + MinTimestamp: time.Date(2000, 1, i+1, 0, 0, 0, 0, time.UTC).UnixMilli(), + MaxTimestamp: time.Date(2000, 1, i+1, 23, 59, 59, 999, time.UTC).UnixMilli(), + } + days = append(days, day) + prefix1 := fmt.Sprintf("metric1_%d", i) + mrs = append(mrs, testGenerateMetricRowsWithPrefixForTenantID(rng, 0, 0, numRows, prefix1, day)...) + prefix2 := fmt.Sprintf("metric2_%d", i) + mrs = append(mrs, testGenerateMetricRowsWithPrefixForTenantID(rng, 0, 0, numRows, prefix2, day)...) + } + + type options struct { + path string + filters []string + tr TimeRange + maxMetrics int + wantErr bool + wantCount int + } + f := func(opts *options) { + t.Helper() + + s := MustOpenStorage(t.Name()+"/"+opts.path, 0, 0, 0) + defer s.MustClose() + s.AddRows(mrs, defaultPrecisionBits) + s.DebugFlush() + + var tfss []*TagFilters + for _, filter := range opts.filters { + filter := fmt.Sprintf("%s.*", filter) + tfs := NewTagFilters(0, 0) + if err := tfs.Add(nil, []byte(filter), false, true); err != nil { + t.Fatalf("unexpected error in TagFilters.Add: %v", err) + } + tfss = append(tfss, tfs) + } + + names, err := s.SearchMetricNames(nil, tfss, opts.tr, opts.maxMetrics, noDeadline) + gotErr := err != nil + if gotErr != opts.wantErr { + t.Errorf("SeachMetricNames(%v, %v, %d): unexpected error: got %v, want error to happen %v", []any{ + tfss, &opts.tr, opts.maxMetrics, err, opts.wantErr, + }...) 
+ } + if got := len(names); got != opts.wantCount { + t.Errorf("SeachMetricNames(%v, %v, %d): unexpected metric name count: got %d, want %d", []any{ + tfss, &opts.tr, opts.maxMetrics, got, opts.wantCount, + }...) + } + } + + // Using one filter to search metric names within one day. The maxMetrics + // param is set to match exactly the number of time series that match the + // filter within that time range. Search operation must complete + // successfully. + f(&options{ + path: "OneDay/OneTagFilter/MaxMetricsNotExeeded", + filters: []string{"metric1"}, + tr: days[0], + maxMetrics: numRows, + wantCount: numRows, + }) + + // Using one filter to search metric names within one day. The maxMetrics + // param is less than the number of time series that match the filter + // within that time range. Search operation must fail. + f(&options{ + path: "OneDay/OneTagFilter/MaxMetricsExeeded", + filters: []string{"metric1"}, + tr: days[0], + maxMetrics: numRows - 1, + wantErr: true, + }) + + // Using two filters to search metric names within one day. The maxMetrics + // param is set to match exactly the number of time series that match the + // two filters within that time range. Search operation must complete + // successfully. + f(&options{ + path: "OneDay/TwoTagFilters/MaxMetricsNotExeeded", + filters: []string{"metric1", "metric2"}, + tr: days[0], + maxMetrics: numRows * 2, + wantCount: numRows * 2, + }) + + // Using two filters to search metric names within one day. The maxMetrics + // param is less than the number of time series that match the two filters + // within that time range. Search operation must fail. + f(&options{ + path: "OneDay/TwoTagFilters/MaxMetricsExeeded", + filters: []string{"metric1", "metric2"}, + tr: days[0], + maxMetrics: numRows*2 - 1, + wantErr: true, + }) + + // Using one filter to search metric names within two days. The maxMetrics + // param is set to match exactly the number of time series that match the + // filter within that time range. 
Search operation must complete + // successfully. + f(&options{ + path: "TwoDays/OneTagFilter/MaxMetricsNotExeeded", + filters: []string{"metric1"}, + tr: TimeRange{ + MinTimestamp: days[0].MinTimestamp, + MaxTimestamp: days[1].MaxTimestamp, + }, + maxMetrics: numRows * 2, + wantCount: numRows * 2, + }) + + // Using one filter to search metric names within two days. The maxMetrics + // param is less than the number of time series that match the filter + // within that time range. Search operation must fail. + f(&options{ + path: "TwoDays/OneTagFilter/MaxMetricsExeeded", + filters: []string{"metric1"}, + tr: TimeRange{ + MinTimestamp: days[0].MinTimestamp, + MaxTimestamp: days[1].MaxTimestamp, + }, + maxMetrics: numRows*2 - 1, + wantErr: true, + }) + + // Using two filters to search metric names within two days. The maxMetrics + // param is set to match exactly the number of time series that match the + // two filters within that time range. Search operation must complete + // successfully. + f(&options{ + path: "TwoDays/TwoTagFilters/MaxMetricsNotExeeded", + filters: []string{"metric1", "metric2"}, + tr: TimeRange{ + MinTimestamp: days[0].MinTimestamp, + MaxTimestamp: days[1].MaxTimestamp, + }, + maxMetrics: numRows * 4, + wantCount: numRows * 4, + }) + + // Using two filters to search metric names within two days. The maxMetrics + // param is less than the number of time series that match the two filters + // within that time range. Search operation must fail. + f(&options{ + path: "TwoDays/TwoTagFilters/MaxMetricsExeeded", + filters: []string{"metric1", "metric2"}, + tr: TimeRange{ + MinTimestamp: days[0].MinTimestamp, + MaxTimestamp: days[1].MaxTimestamp, + }, + maxMetrics: numRows*4 - 1, + wantErr: true, + }) + + // Using one filter to search metric names within the time range of 41 days. + // This time range corresponds to the day difference of 40 days, which is + // the max day difference when the per-day index is still used for + // searching. 
The maxMetrics param is set to match exactly the number of + // time series that match the filter within that time range. Search + // operation must complete successfully. + f(&options{ + path: "40Days/OneTagFilter/MaxMetricsNotExeeded", + filters: []string{"metric1"}, + tr: TimeRange{ + MinTimestamp: days[0].MinTimestamp, + MaxTimestamp: days[40].MaxTimestamp, + }, + maxMetrics: numRows * 41, + wantCount: numRows * 41, + }) + + // Using one filter to search metric names within the time range of 42 days. + // This time range corresponds to the day difference of 41 days, which is + // longer than than 40 days. In this case, the search is performed using + // global index instead of per-day index and the metric names will be + // searched within the entire retention period. The maxMetrics parameter, + // however, is set to the number of time series within the 42 days. The + // search must fail because the number of metrics will be much larger. + f(&options{ + path: "MoreThan40Days/OneTagFilter/MaxMetricsExeeded", + filters: []string{"metric1"}, + tr: TimeRange{ + MinTimestamp: days[0].MinTimestamp, + MaxTimestamp: days[41].MaxTimestamp, + }, + maxMetrics: numRows * 42, + wantErr: true, + }) + + // To fix the above case, the maxMetrics must be adjusted to be not less + // than the number of time series within the entire retention period. + f(&options{ + path: "MoreThan40Days/OneTagFilter/MaxMetricsNotExeeded", + filters: []string{"metric1"}, + tr: TimeRange{ + MinTimestamp: days[0].MinTimestamp, + MaxTimestamp: days[41].MaxTimestamp, + }, + maxMetrics: numRows * numDays, + wantCount: numRows * numDays, + }) +} + // testCountAllMetricIDs is a test helper function that counts the IDs of // all time series within the given time range. func testCountAllMetricIDs(s *Storage, tr TimeRange) int {