From cd1145e5f469dc756e4a7d222935d0b0a875e6c7 Mon Sep 17 00:00:00 2001
From: Aliaksandr Valialkin <valyala@gmail.com>
Date: Tue, 28 Apr 2020 12:51:36 +0300
Subject: [PATCH] app/vmselect: add
 `-search.estimatedSeriesCountAfterAggregation` command-line flag for tuning
 the probability of OOMs or false-positive `not enough memory` errors

---
 app/vmselect/promql/eval.go | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/app/vmselect/promql/eval.go b/app/vmselect/promql/eval.go
index 7b9df7f7c..a1e33a7fd 100644
--- a/app/vmselect/promql/eval.go
+++ b/app/vmselect/promql/eval.go
@@ -18,7 +18,10 @@ import (
 )
 
 var (
-	maxPointsPerTimeseries = flag.Int("search.maxPointsPerTimeseries", 30e3, "The maximum points per a single timeseries returned from the search")
+	maxPointsPerTimeseries               = flag.Int("search.maxPointsPerTimeseries", 30e3, "The maximum points per a single timeseries returned from the search")
+	estimatedSeriesCountAfterAggregation = flag.Int("search.estimatedSeriesCountAfterAggregation", 1000, "Estimated number of series returned by aggregation with grouping "+
+		"such as `sum(...) by (...)`. Increase this value in order to reduce the probability of OOMs. Reduce this value in order to reduce 'not enough memory' errors "+
+		"for queries containing aggregation with grouping")
 )
 
 // The minimum number of points per timeseries for enabling time rounding.
@@ -679,8 +682,7 @@ func evalRollupFuncWithMetricExpr(ec *EvalConfig, name string, rf rollupFunc,
 		if iafc.ae.Modifier.Op != "" {
 			// Increase the number of timeseries for non-empty group list: `aggr() by (something)`,
 			// since each group can have own set of time series in memory.
-			// Estimate the number of such groups is lower than 1000 :)
-			timeseriesLen *= 1000
+			timeseriesLen *= *estimatedSeriesCountAfterAggregation
 		}
 	}
 	rollupPoints := mulNoOverflow(pointsPerTimeseries, int64(timeseriesLen*len(rcs)))
@@ -690,8 +692,8 @@ func evalRollupFuncWithMetricExpr(ec *EvalConfig, name string, rf rollupFunc,
 		rss.Cancel()
 		return nil, fmt.Errorf("not enough memory for processing %d data points across %d time series with %d points in each time series; "+
 			"possible solutions are: reducing the number of matching time series; switching to node with more RAM; "+
-			"increasing -memory.allowedPercent; increasing `step` query arg (%gs)",
-			rollupPoints, rssLen*len(rcs), pointsPerTimeseries, float64(ec.Step)/1e3)
+			"increasing -memory.allowedPercent; increasing `step` query arg (%gs); reducing -search.estimatedSeriesCountAfterAggregation",
+			rollupPoints, timeseriesLen*len(rcs), pointsPerTimeseries, float64(ec.Step)/1e3)
 	}
 	defer rml.Put(uint64(rollupMemorySize))