app/vmalert: update parsing for instant responses (#6859)

This change is made in an attempt to reduce memory usage by vmalert when parsing big instant responses from VM/Prometheus. In a5c427bac4 vmalert switched from the std json lib to the fastjson lib in order to reduce the amount of allocations, since according to profiles of highly loaded vmalert instances the CPU is mostly spent on GC. But switching to fastjson resulted in excessive memory usage for cases when vmalert has to parse long json lines, which usually happens when an instant response contains many `metric` objects. In this change we do mixed parsing: 1. Slice of `metric` objects is parsed with std lib to keep mem low 2. Each `metric` object is parsed with fastjson to reduce allocs The benchmark results are the following: ``` pkg: github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource BenchmarkParsePrometheusResponse/Instant_std+fastjson-10 1760 668959 ns/op 280147 B/op 5781 allocs/op MBs allocated at heap: 493.078392 mallocs: 18655472 BenchmarkParsePrometheusResponse/Instant_fastjson-10 6109 198258 ns/op 172839 B/op 5548 allocs/op MBs allocated at heap: 1056.384464 mallocs: 34457184 BenchmarkParsePrometheusResponse/Instant_std-10 1287 950987 ns/op 451677 B/op 9619 allocs/op MBs allocated at heap: 580.802976 mallocs: 13351636 ``` The benchmark function code with mem measurement is available here https://gist.github.com/hagen1778/b9c3ca7f8ca7d6b21aec9777112c5810 The benchmark contains 3 results: 1. Instant_std+fastjson is the implementation in this change 2. Instant_fastjson-10 is the implementation from a5c427bac4 3. BenchmarkParsePrometheusResponse/Instant_std-10 is the implementation before a5c427bac4 According to these results, this new implementation is slower than the previous one, but faster than the one before switching to fastjson. It also has a lower number of allocations and roughly the same memory allocation on heap with GC turned off. --------- Other changes: 1. rm BenchmarkMetrics as it doesn't measure anything 2. 
simplify BenchmarkParsePrometheusResponse into BenchmarkPromInstantUnmarshal ### Describe Your Changes Please provide a brief description of the changes you made. Be as specific as possible to help others understand the purpose and impact of your modifications. ### Checklist The following checks are **mandatory**: - [ ] My change adheres [VictoriaMetrics contributing guidelines](https://docs.victoriametrics.com/contributing/). Signed-off-by: hagen1778 <roman@victoriametrics.com>
2025-03-11 15:34:56 +00:00 · 2024-08-22 17:36:11 +02:00 · 2024-08-22 17:36:11 +02:00 · 70a94ea492
commit 70a94ea492
parent e35237920a
4 changed files with 21 additions and 36 deletions
--- a/app/vmalert/datasource/testdata/instant_response.json
+++ b/app/vmalert/datasource/testdata/instant_response.json
--- a/app/vmalert/datasource/vm_prom_api.go
+++ b/app/vmalert/datasource/vm_prom_api.go
@ -57,20 +57,23 @@ var jsonParserPool fastjson.ParserPool
 //	[{"metric":{"__name__":"up","job":"prometheus"},"value": [ 1435781451.781,"1"]},
 //	{"metric":{"__name__":"up","job":"node"},"value": [ 1435781451.781,"0"]}]
 func (pi *promInstant) Unmarshal(b []byte) error {
+	var metrics []json.RawMessage
+	// metrics slice could be large, so parsing it with fastjson could consume a lot of memory.
+	// We parse the slice with standard lib to keep mem usage low.
+	// And each metric object will be parsed with fastjson to reduce allocations.
+	if err := json.Unmarshal(b, &metrics); err != nil {
+		return fmt.Errorf("cannot unmarshal metrics: %w", err)
+	}
+
 	p := jsonParserPool.Get()
 	defer jsonParserPool.Put(p)

-	v, err := p.ParseBytes(b)
-	if err != nil {
-		return err
-	}
-
-	rows, err := v.Array()
-	if err != nil {
-		return fmt.Errorf("cannot find the top-level array of result objects: %w", err)
-	}
-	pi.ms = make([]Metric, len(rows))
-	for i, row := range rows {
+	pi.ms = make([]Metric, len(metrics))
+	for i, data := range metrics {
+		row, err := p.ParseBytes(data)
+		if err != nil {
+			return fmt.Errorf("cannot parse metric object: %w", err)
+		}
 		metric := row.Get("metric")
 		if metric == nil {
 			return fmt.Errorf("can't find `metric` object in %q", row)
--- a/app/vmalert/datasource/vm_prom_api_timing_test.go
+++ b/app/vmalert/datasource/vm_prom_api_timing_test.go
@ -1,43 +1,24 @@
 package datasource

 import (
-	"bytes"
-	"io"
-	"net/http"
 	"os"
 	"testing"
 )

-func BenchmarkMetrics(b *testing.B) {
-	payload := []byte(`[{"metric":{"__name__":"vm_rows"},"value":[1583786142,"13763"]},{"metric":{"__name__":"vm_requests", "foo":"bar", "baz": "qux"},"value":[1583786140,"2000"]}]`)
-
-	var pi promInstant
-	if err := pi.Unmarshal(payload); err != nil {
-		b.Fatal(err.Error())
-	}
-	b.Run("Instant", func(b *testing.B) {
-		for i := 0; i < b.N; i++ {
-			_, _ = pi.metrics()
-		}
-	})
-}
-
-func BenchmarkParsePrometheusResponse(b *testing.B) {
-	req, _ := http.NewRequest("GET", "", nil)
-	resp := &http.Response{StatusCode: http.StatusOK}
+func BenchmarkPromInstantUnmarshal(b *testing.B) {
 	data, err := os.ReadFile("testdata/instant_response.json")
 	if err != nil {
 		b.Fatalf("error while reading file: %s", err)
 	}
-	resp.Body = io.NopCloser(bytes.NewReader(data))

-	b.Run("Instant", func(b *testing.B) {
+	// BenchmarkParsePrometheusResponse/Instant_std+fastjson-10                    1760            668959 ns/op          280147 B/op       5781 allocs/op
+	b.Run("Instant std+fastjson", func(b *testing.B) {
 		for i := 0; i < b.N; i++ {
-			_, err := parsePrometheusResponse(req, resp)
+			var pi promInstant
+			err = pi.Unmarshal(data)
 			if err != nil {
 				b.Fatalf("unexpected parse err: %s", err)
 			}
-			resp.Body = io.NopCloser(bytes.NewReader(data))
 		}
 	})
 }
--- a/docs/changelog/CHANGELOG.md
+++ b/docs/changelog/CHANGELOG.md
@ -45,6 +45,7 @@ The value of `instance` label for those scrape targets will be changed from `<ad
 * BUGFIX: `vminsert` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): reduce CPU usage by limiting the number of concurrently running inserts. The issue was introduced in [this commit](https://github.com/VictoriaMetrics/VictoriaMetrics/commit/498fe1cfa523be5bfecaa372293c3cded85e75ab) starting from v1.101.0. See [this](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6733) issue for details.
 * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/metricsql/): fix calculation [histogram_quantile](https://docs.victoriametrics.com/metricsql/#histogram_quantile) over Prometheus buckets with inconsistent values. It was producing incorrect results in case lower buckets. The issue was introduced in [v1.102.0](https://docs.victoriametrics.com/changelog/#v11020) release, see [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6714) for the details.
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert/), [vmctl](https://docs.victoriametrics.com/vmctl/) and snapshot API: verify correctness of URLs provided via cmd-line flags before executing HTTP requests. See [this](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6740) issue for details.
+* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert): reduce memory usage when parsing responses with a big number of metrics. The memory usage was increased in [v1.102.0-rc1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.102.0-rc1) after an attempt to reduce CPU usage for heavily loaded vmalerts.
 * BUGFIX: all VictoriaMetrics components: forcefully set owner/group for release tars to 1000:1000. This helps to avoid unpacking [issues](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6788) on systems with limitations around UID:GID configuration. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6846).

 ## [v1.102.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.102.1)