app/vmalert/datasource: reduce number of allocations when parsing instant responses (#6272)

Allocations are reduced by implementing a custom JSON parser via the fastjson lib.
The change also re-uses the `promInstant` object in an attempt to reduce the number
of allocations when parsing big responses, as usually happens with heavy
recording rules.

```
name                                old allocs/op  new allocs/op  delta
ParsePrometheusResponse/Instant-10     9.65k ± 0%     5.60k ± 0%   ~     (p=1.000 n=1+1)

```
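The table above appears to be `benchstat` output. For reference, a comparison like this can be reproduced with standard Go tooling along the lines shown below; the commit refs and file names are placeholders, not part of this change:

```
# run the parsing benchmark before and after the change (placeholder refs and file names)
git checkout <before> && go test -run='^$' -bench=ParsePrometheusResponse -benchmem ./app/vmalert/datasource/ > old.txt
git checkout <after>  && go test -run='^$' -bench=ParsePrometheusResponse -benchmem ./app/vmalert/datasource/ > new.txt
# compare allocation counts between the two runs
benchstat old.txt new.txt
```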

Signed-off-by: hagen1778 <roman@victoriametrics.com>

5 changed files with 189 additions and 29 deletions

File diff suppressed because one or more lines are too long


@@ -7,6 +7,10 @@ import (
    "net/http"
    "strconv"
    "time"

    "github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
    "github.com/valyala/fastjson"
)

var (
@@ -31,27 +35,85 @@ type promResponse struct {
    } `json:"stats,omitempty"`
}

// see https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries
type promInstant struct {
    Result []struct {
        Labels map[string]string `json:"metric"`
        TV     [2]interface{}    `json:"value"`
    } `json:"result"`
    // ms is populated after Unmarshal call
    ms []Metric
}
func (r promInstant) metrics() ([]Metric, error) {
    result := make([]Metric, len(r.Result))
    for i, res := range r.Result {
        f, err := strconv.ParseFloat(res.TV[1].(string), 64)
        if err != nil {
            return nil, fmt.Errorf("metric %v, unable to parse float64 from %s: %w", res, res.TV[1], err)
        }
        var m Metric
        m.SetLabels(res.Labels)
        m.Timestamps = append(m.Timestamps, int64(res.TV[0].(float64)))
        m.Values = append(m.Values, f)
        result[i] = m
    }
    return result, nil
}

// metrics returns the parsed Metric slice.
// Must be called only after Unmarshal.
func (pi *promInstant) metrics() ([]Metric, error) {
    return pi.ms, nil
}
var jsonParserPool fastjson.ParserPool

// Unmarshal unmarshals the given byte slice into promInstant.
// It uses fastjson to reduce the number of allocations compared to the
// standard json.Unmarshal function.
// Response example:
//
//  [{"metric":{"__name__":"up","job":"prometheus"},"value":[1435781451.781,"1"]},
//   {"metric":{"__name__":"up","job":"node"},"value":[1435781451.781,"0"]}]
func (pi *promInstant) Unmarshal(b []byte) error {
    p := jsonParserPool.Get()
    defer jsonParserPool.Put(p)

    v, err := p.ParseBytes(b)
    if err != nil {
        return err
    }

    rows, err := v.Array()
    if err != nil {
        return fmt.Errorf("cannot find the top-level array of result objects: %w", err)
    }
    pi.ms = make([]Metric, len(rows))
    for i, row := range rows {
        metric := row.Get("metric")
        if metric == nil {
            return fmt.Errorf("can't find `metric` object in %q", row)
        }
        labels := metric.GetObject()

        r := &pi.ms[i]
        r.Labels = make([]Label, 0, labels.Len())
        labels.Visit(func(key []byte, v *fastjson.Value) {
            lv, errLocal := v.StringBytes()
            if errLocal != nil {
                err = fmt.Errorf("error when parsing label value %q: %s", v, errLocal)
                return
            }
            r.Labels = append(r.Labels, Label{
                Name:  string(key),
                Value: string(lv),
            })
        })
        if err != nil {
            return fmt.Errorf("error when parsing `metric` object in %q: %w", row, err)
        }

        value := row.Get("value")
        if value == nil {
            return fmt.Errorf("can't find `value` object in %q", row)
        }
        sample := value.GetArray()
        if len(sample) != 2 {
            return fmt.Errorf("object `value` in %q should contain 2 values, but contains %d instead", row, len(sample))
        }
        r.Timestamps = []int64{sample[0].GetInt64()}

        val, err := sample[1].StringBytes()
        if err != nil {
            return fmt.Errorf("error when parsing `value` object %q: %s", sample[1], err)
        }
        f, err := strconv.ParseFloat(bytesutil.ToUnsafeString(val), 64)
        if err != nil {
            return fmt.Errorf("error when parsing float64 from %s in %q: %w", sample[1], row, err)
        }
        r.Values = []float64{f}
    }
    return nil
}
type promRange struct {
@@ -118,7 +180,7 @@ func parsePrometheusResponse(req *http.Request, resp *http.Response) (res Result
    switch r.Data.ResultType {
    case rtVector:
        var pi promInstant
        if err := json.Unmarshal(r.Data.Result, &pi.Result); err != nil {
        if err := pi.Unmarshal(r.Data.Result); err != nil {
            return res, fmt.Errorf("unmarshal err %w; \n %#v", err, string(r.Data.Result))
        }
        parseFn = pi.metrics


@@ -1,20 +1,73 @@
package datasource

import (
    "encoding/json"
    "reflect"
    "testing"
)

func BenchmarkMetrics(b *testing.B) {
    payload := []byte(`[{"metric":{"__name__":"vm_rows"},"value":[1583786142,"13763"]},{"metric":{"__name__":"vm_requests", "foo":"bar", "baz": "qux"},"value":[1583786140,"2000"]}]`)

    var pi promInstant
    if err := json.Unmarshal(payload, &pi.Result); err != nil {
        b.Fatalf(err.Error())
    }
    b.Run("Instant", func(b *testing.B) {
        for i := 0; i < b.N; i++ {
            _, _ = pi.metrics()
        }
    })
}
func TestPromInstant_UnmarshalPositive(t *testing.T) {
    f := func(data string, exp []Metric) {
        t.Helper()
        var pi promInstant
        err := pi.Unmarshal([]byte(data))
        if err != nil {
            t.Fatalf("unexpected unmarshal err %v; \n %v", err, string(data))
        }
        got, _ := pi.metrics()
        if !reflect.DeepEqual(got, exp) {
            t.Fatalf("expected to get:\n%v\ngot instead:\n%v", exp, got)
        }
    }

    f(`[{"metric":{"__name__":"up"},"value":[1583780000,"42"]}]`, []Metric{
        {
            Labels:     []Label{{Name: "__name__", Value: "up"}},
            Timestamps: []int64{1583780000},
            Values:     []float64{42},
        },
    })
    f(`[
{"metric":{"__name__":"up"},"value":[1583780000,"42"]},
{"metric":{"__name__":"foo"},"value":[1583780001,"7"]},
{"metric":{"__name__":"baz", "instance":"bar"},"value":[1583780002,"8"]}]`, []Metric{
        {
            Labels:     []Label{{Name: "__name__", Value: "up"}},
            Timestamps: []int64{1583780000},
            Values:     []float64{42},
        },
        {
            Labels:     []Label{{Name: "__name__", Value: "foo"}},
            Timestamps: []int64{1583780001},
            Values:     []float64{7},
        },
        {
            Labels:     []Label{{Name: "__name__", Value: "baz"}, {Name: "instance", Value: "bar"}},
            Timestamps: []int64{1583780002},
            Values:     []float64{8},
        },
    })
}
func TestPromInstant_UnmarshalNegative(t *testing.T) {
    f := func(data string) {
        t.Helper()
        var pi promInstant
        err := pi.Unmarshal([]byte(data))
        if err == nil {
            t.Fatalf("expected to get an error; got nil instead")
        }
    }

    f(``)
    f(`foo`)
    f(`[{"metric":{"__name__":"up"},"value":[1583780000,"42"]},`)
    f(`[{"metric":{"__name__"},"value":[1583780000,"42"]},`)
    // no `metric` object
    f(`[{"value":[1583780000,"42"]}]`)
    // no `value` object
    f(`[{"metric":{"__name__":"up"}}]`)
    // less than 2 values in `value` object
    f(`[{"metric":{"__name__":"up"},"value":["42"]}]`)
    f(`[{"metric":{"__name__":"up"},"value":[1583780000]}]`)
    // non-numeric sample value
    f(`[{"metric":{"__name__":"up"},"value":[1583780000,"foo"]}]`)
}


@@ -0,0 +1,43 @@
package datasource

import (
    "bytes"
    "io"
    "net/http"
    "os"
    "testing"
)

func BenchmarkMetrics(b *testing.B) {
    payload := []byte(`[{"metric":{"__name__":"vm_rows"},"value":[1583786142,"13763"]},{"metric":{"__name__":"vm_requests", "foo":"bar", "baz": "qux"},"value":[1583786140,"2000"]}]`)

    var pi promInstant
    if err := pi.Unmarshal(payload); err != nil {
        b.Fatalf(err.Error())
    }
    b.Run("Instant", func(b *testing.B) {
        for i := 0; i < b.N; i++ {
            _, _ = pi.metrics()
        }
    })
}

func BenchmarkParsePrometheusResponse(b *testing.B) {
    req, _ := http.NewRequest("GET", "", nil)
    resp := &http.Response{StatusCode: http.StatusOK}
    data, err := os.ReadFile("testdata/instant_response.json")
    if err != nil {
        b.Fatalf("error while reading file: %s", err)
    }
    resp.Body = io.NopCloser(bytes.NewReader(data))

    b.Run("Instant", func(b *testing.B) {
        for i := 0; i < b.N; i++ {
            _, err := parsePrometheusResponse(req, resp)
            if err != nil {
                b.Fatalf("unexpected parse err: %s", err)
            }
            resp.Body = io.NopCloser(bytes.NewReader(data))
        }
    })
}
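Assuming standard Go tooling and the package path from the commit title, the new tests and benchmarks added above can be run with something like:

```
# unit tests for the fastjson-based Unmarshal
go test -run TestPromInstant ./app/vmalert/datasource/
# allocation benchmarks added in this commit
go test -run='^$' -bench='Metrics|ParsePrometheusResponse' -benchmem ./app/vmalert/datasource/
```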


@@ -44,6 +44,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): allow configuring `-remoteWrite.disableOnDiskQueue` and `-remoteWrite.dropSamplesOnOverload` cmd-line flags per each `-remoteWrite.url`. See this [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6065). Thanks to @rbizos for implementation!
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add labels `path` and `url` to metrics `vmagent_remotewrite_push_failures_total` and `vmagent_remotewrite_samples_dropped_total`. Now number of failed pushes and dropped samples can be tracked per `-remoteWrite.url`.
* FEATURE: [stream aggregation](https://docs.victoriametrics.com/stream-aggregation/): add [rate_sum](https://docs.victoriametrics.com/stream-aggregation/#rate_sum) and [rate_avg](https://docs.victoriametrics.com/stream-aggregation/#rate_avg) aggregation outputs.
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert/): reduce CPU usage when evaluating a high number of alerting and recording rules.
* BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix bug that prevents the first query trace from expanding on click event. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6186). The issue was introduced in [v1.100.0](https://docs.victoriametrics.com/changelog/#v11000) release.
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent/): prevent potential panic during [stream aggregation](https://docs.victoriametrics.com/stream-aggregation.html) if more than one `--remoteWrite.streamAggr.dedupInterval` is configured. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6205).