Merge branch 'public-single-node' into pmm-6401-read-prometheus-data-files

2024-12-01 14:47:38 +00:00 · 2021-03-29 19:15:52 +03:00 · 2021-03-29 19:15:52 +03:00 · 11ded82e60
commit 11ded82e60
parent 558b390ebc 947b37ba8e
36 changed files with 2082 additions and 285 deletions
--- a/app/vmagent/README.md
+++ b/app/vmagent/README.md
@ -358,6 +358,10 @@ It may be useful to perform `vmagent` rolling update without any scrape loss.
  Such gaps may appear because `vmagent` cannot keep up with sending the collected data to remote storage. Therefore it starts dropping the buffered data
  if the on-disk buffer size exceeds `-remoteWrite.maxDiskUsagePerURL`.

+* `vmagent` drops data blocks if remote storage replies with `400 Bad Request` or `409 Conflict` HTTP responses. The number of dropped blocks can be monitored via the `vmagent_remotewrite_packets_dropped_total` metric exported at the [/metrics page](#monitoring).
+
+* Use `-remoteWrite.queues=1` when `-remoteWrite.url` points to a remote storage that doesn't accept out-of-order samples (aka data backfilling). Such storage systems include Prometheus, Cortex and Thanos.
+
 * `vmagent` buffers scraped data at the `-remoteWrite.tmpDataPath` directory until it is sent to `-remoteWrite.url`.
  The directory can grow large when remote storage is unavailable for extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
  If you don't want to send all the data from the directory to remote storage then simply stop `vmagent` and delete the directory.
--- a/app/vmagent/remotewrite/client.go
+++ b/app/vmagent/remotewrite/client.go
@ -259,13 +259,13 @@ again:
 		return true
 	}
 	metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_requests_total{url=%q, status_code="%d"}`, c.sanitizedURL, statusCode)).Inc()
-	if statusCode == 409 {
+	if statusCode == 409 || statusCode == 400 {
 		// Just drop block on 409 status code like Prometheus does.
 		// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/873
-		body, _ := ioutil.ReadAll(resp.Body)
+		// drop block on 400 status code,
+		// not expected that remote server will be able to handle it on retry
+		// should fix https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1149
 		_ = resp.Body.Close()
-		logger.Errorf("unexpected status code received when sending a block with size %d bytes to %q: #%d; dropping the block like Prometheus does; "+
-			"response body=%q", len(block), c.sanitizedURL, statusCode, body)
 		c.packetsDropped.Inc()
 		return true
 	}
--- a/app/vmselect/promql/rollup_result_cache.go
+++ b/app/vmselect/promql/rollup_result_cache.go
@ -29,12 +29,16 @@ var (
 // ResetRollupResultCacheIfNeeded resets rollup result cache if mrs contains timestamps outside `now - search.cacheTimestampOffset`.
 func ResetRollupResultCacheIfNeeded(mrs []storage.MetricRow) {
 	checkRollupResultCacheResetOnce.Do(func() {
+		rollupResultResetMetricRowSample.Store(&storage.MetricRow{})
 		go checkRollupResultCacheReset()
 	})
 	minTimestamp := int64(fasttime.UnixTimestamp()*1000) - cacheTimestampOffset.Milliseconds() + checkRollupResultCacheResetInterval.Milliseconds()
 	needCacheReset := false
 	for i := range mrs {
 		if mrs[i].Timestamp < minTimestamp {
+			var mr storage.MetricRow
+			mr.CopyFrom(&mrs[i])
+			rollupResultResetMetricRowSample.Store(&mr)
 			needCacheReset = true
 			break
 		}
@ -49,6 +53,10 @@ func checkRollupResultCacheReset() {
 	for {
 		time.Sleep(checkRollupResultCacheResetInterval)
 		if atomic.SwapUint32(&needRollupResultCacheReset, 0) > 0 {
+			mr := rollupResultResetMetricRowSample.Load().(*storage.MetricRow)
+			d := int64(fasttime.UnixTimestamp()*1000) - mr.Timestamp - cacheTimestampOffset.Milliseconds()
+			logger.Warnf("resetting rollup result cache because the metric %s has a timestamp older than -search.cacheTimestampOffset=%s by %.3fs",
+				mr.String(), cacheTimestampOffset, float64(d)/1e3)
 			ResetRollupResultCache()
 		}
 	}
@ -58,6 +66,7 @@ const checkRollupResultCacheResetInterval = 5 * time.Second

 var needRollupResultCacheReset uint32
 var checkRollupResultCacheResetOnce sync.Once
+var rollupResultResetMetricRowSample atomic.Value

 var rollupResultCacheV = &rollupResultCache{
 	c: workingsetcache.New(1024*1024, time.Hour), // This is a cache for testing.
--- a/dashboards/victoriametrics.json
+++ b/dashboards/victoriametrics.json
@ -45,12 +45,12 @@
      }
    ]
  },
-  "description": "Overview for single node VictoriaMetrics v1.55.1 or higher",
+  "description": "Overview for single node VictoriaMetrics v1.56.0 or higher",
  "editable": true,
  "gnetId": 10229,
  "graphTooltip": 0,
  "id": null,
-  "iteration": 1615713966732,
+  "iteration": 1616956884194,
  "links": [
    {
      "icon": "doc",
@ -2654,7 +2654,7 @@
          "dashLength": 10,
          "dashes": false,
          "datasource": "$ds",
-          "description": "Shows how many of new time-series are created every second.  High churn rate tightly connected with database performance and may result in unexpected OOM's or slow queries. It is recommended to always keep an eye on this metric to avoid unexpected cardinality \"explosions\".\n\nGood references to read:\n* https://www.robustperception.io/cardinality-is-key\n* https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality",
+          "description": "Shows the rate and total number of new series created over the last 24h.\n\nHigh churn rate is tightly connected with database performance and may result in unexpected OOM's or slow queries. It is recommended to always keep an eye on this metric to avoid unexpected cardinality \"explosions\".\n\nThe higher the churn rate is, the more resources are required to handle it. Consider keeping the churn rate as low as possible.\n\nGood references to read:\n* https://www.robustperception.io/cardinality-is-key\n* https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality",
          "fieldConfig": {
            "defaults": {
              "custom": {},
@ -2668,7 +2668,7 @@
            "h": 8,
            "w": 12,
            "x": 0,
-            "y": 85
+            "y": 32
          },
          "hiddenSeries": false,
          "id": 66,
@ -2689,15 +2689,27 @@
          "pointradius": 2,
          "points": false,
          "renderer": "flot",
-          "seriesOverrides": [],
+          "seriesOverrides": [
+            {
+              "alias": "new series over 24h",
+              "yaxis": 2
+            }
+          ],
          "spaceLength": 10,
          "stack": false,
          "steppedLine": false,
          "targets": [
            {
              "expr": "sum(rate(vm_new_timeseries_created_total{job=\"$job\", instance=\"$instance\"}[5m]))",
+              "interval": "",
              "legendFormat": "churn rate",
              "refId": "A"
+            },
+            {
+              "expr": "sum(increase(vm_new_timeseries_created_total{job=\"$job\", instance=\"$instance\"}[24h]))",
+              "interval": "",
+              "legendFormat": "new series over 24h",
+              "refId": "B"
            }
          ],
          "thresholds": [],
@ -2761,7 +2773,7 @@
            "h": 8,
            "w": 12,
            "x": 12,
-            "y": 85
+            "y": 32
          },
          "hiddenSeries": false,
          "id": 60,
@ -2859,7 +2871,7 @@
            "h": 9,
            "w": 12,
            "x": 0,
-            "y": 93
+            "y": 40
          },
          "hiddenSeries": false,
          "id": 68,
@ -2958,7 +2970,7 @@
            "h": 9,
            "w": 12,
            "x": 12,
-            "y": 93
+            "y": 40
          },
          "hiddenSeries": false,
          "id": 74,
--- a/dashboards/vmagent.json
+++ b/dashboards/vmagent.json
@ -51,12 +51,12 @@
      }
    ]
  },
-  "description": "Overview for VictoriaMetrics vmagent v1.40.0 or higher",
+  "description": "Overview for VictoriaMetrics vmagent v1.56.0 or higher",
  "editable": true,
  "gnetId": null,
  "graphTooltip": 1,
  "id": null,
-  "iteration": 1598997251171,
+  "iteration": 1616957263139,
  "links": [
    {
      "icon": "doc",
@ -1283,6 +1283,101 @@
        "alignLevel": null
      }
    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$ds",
+      "description": "Shows the rate of dropped data blocks in cases when remote storage replies with `400 Bad Request` or `409 Conflict` HTTP responses.\n\nSee https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1149",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {},
+          "links": []
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 32
+      },
+      "hiddenSeries": false,
+      "id": 79,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": false,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pluginVersion": "7.1.1",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(rate(vmagent_remotewrite_packets_dropped_total{job=~\"$job\", instance=~\"$instance\"}[$__interval]))",
+          "interval": "",
+          "legendFormat": "",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Data blocks dropped ($instance)",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "bytes",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
    {
      "collapsed": true,
      "datasource": "$ds",
@ -3105,6 +3200,7 @@
          "dashLength": 10,
          "dashes": false,
          "datasource": "$ds",
+          "description": "Panel shows the number of open file descriptors in the OS.\nReaching the limit of open files can cause various issues and must be prevented.\n\nSee how to change limits here https://medium.com/@muhammadtriwibowo/set-permanently-ulimit-n-open-files-in-ubuntu-4d61064429a",
          "fieldConfig": {
            "defaults": {
              "custom": {},
@ -3121,6 +3217,326 @@
            "y": 13
          },
          "hiddenSeries": false,
+          "id": 83,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": false,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pluginVersion": "7.1.1",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [
+            {
+              "alias": "max",
+              "color": "#C4162A"
+            }
+          ],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(process_open_fds{job=~\"$job\", instance=~\"$instance\"})",
+              "format": "time_series",
+              "interval": "",
+              "intervalFactor": 2,
+              "legendFormat": "open",
+              "refId": "A"
+            },
+            {
+              "expr": "min(process_max_fds{job=~\"$job\", instance=~\"$instance\"})",
+              "format": "time_series",
+              "interval": "",
+              "intervalFactor": 2,
+              "legendFormat": "max",
+              "refId": "B"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Open FDs ($instance)",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "decimals": 0,
+              "format": "short",
+              "label": null,
+              "logBase": 2,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "$ds",
+          "fieldConfig": {
+            "defaults": {
+              "custom": {},
+              "links": []
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 13
+          },
+          "hiddenSeries": false,
+          "id": 39,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": false,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pluginVersion": "7.1.1",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(go_goroutines{job=~\"$job\", instance=~\"$instance\"}) by(instance)",
+              "format": "time_series",
+              "interval": "",
+              "intervalFactor": 2,
+              "legendFormat": "{{instance}}",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Goroutines ($instance)",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "decimals": 0,
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "$ds",
+          "description": "Shows the number of bytes read from and written to the storage layer when vmagent has to buffer data on disk or read already buffered data.",
+          "fieldConfig": {
+            "defaults": {
+              "custom": {},
+              "links": []
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 21
+          },
+          "hiddenSeries": false,
+          "id": 81,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": false,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pluginVersion": "7.1.1",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [
+            {
+              "alias": "read",
+              "transform": "negative-Y"
+            }
+          ],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(rate(process_io_storage_read_bytes_total{job=~\"$job\", instance=~\"$instance\"}[5m]))",
+              "format": "time_series",
+              "hide": false,
+              "interval": "",
+              "intervalFactor": 1,
+              "legendFormat": "read",
+              "refId": "A"
+            },
+            {
+              "expr": "sum(rate(process_io_storage_written_bytes_total{job=~\"$job\", instance=~\"$instance\"}[5m]))",
+              "format": "time_series",
+              "hide": false,
+              "interval": "",
+              "intervalFactor": 1,
+              "legendFormat": "write",
+              "refId": "B"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Disk writes/reads ($instance)",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "decimals": null,
+              "format": "bytes",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "$ds",
+          "fieldConfig": {
+            "defaults": {
+              "custom": {},
+              "links": []
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 21
+          },
+          "hiddenSeries": false,
          "id": 41,
          "legend": {
            "avg": false,
@ -3195,102 +3611,6 @@
            "alignLevel": null
          }
        },
-        {
-          "aliasColors": {},
-          "bars": false,
-          "dashLength": 10,
-          "dashes": false,
-          "datasource": "$ds",
-          "fieldConfig": {
-            "defaults": {
-              "custom": {},
-              "links": []
-            },
-            "overrides": []
-          },
-          "fill": 1,
-          "fillGradient": 0,
-          "gridPos": {
-            "h": 8,
-            "w": 12,
-            "x": 12,
-            "y": 13
-          },
-          "hiddenSeries": false,
-          "id": 39,
-          "legend": {
-            "avg": false,
-            "current": false,
-            "max": false,
-            "min": false,
-            "show": false,
-            "total": false,
-            "values": false
-          },
-          "lines": true,
-          "linewidth": 1,
-          "links": [],
-          "nullPointMode": "null",
-          "percentage": false,
-          "pluginVersion": "7.1.1",
-          "pointradius": 2,
-          "points": false,
-          "renderer": "flot",
-          "seriesOverrides": [],
-          "spaceLength": 10,
-          "stack": false,
-          "steppedLine": false,
-          "targets": [
-            {
-              "expr": "sum(go_goroutines{job=~\"$job\", instance=~\"$instance\"}) by(instance)",
-              "format": "time_series",
-              "intervalFactor": 2,
-              "legendFormat": "{{instance}}",
-              "refId": "A"
-            }
-          ],
-          "thresholds": [],
-          "timeFrom": null,
-          "timeRegions": [],
-          "timeShift": null,
-          "title": "Goroutines ($instance)",
-          "tooltip": {
-            "shared": true,
-            "sort": 0,
-            "value_type": "individual"
-          },
-          "type": "graph",
-          "xaxis": {
-            "buckets": null,
-            "mode": "time",
-            "name": null,
-            "show": true,
-            "values": []
-          },
-          "yaxes": [
-            {
-              "decimals": 0,
-              "format": "short",
-              "label": null,
-              "logBase": 1,
-              "max": null,
-              "min": "0",
-              "show": true
-            },
-            {
-              "format": "short",
-              "label": null,
-              "logBase": 1,
-              "max": null,
-              "min": null,
-              "show": true
-            }
-          ],
-          "yaxis": {
-            "align": false,
-            "alignLevel": null
-          }
-        },
        {
          "aliasColors": {},
          "bars": false,
@ -3310,7 +3630,7 @@
            "h": 8,
            "w": 12,
            "x": 0,
-            "y": 21
+            "y": 29
          },
          "hiddenSeries": false,
          "id": 43,
--- a/deployment/docker/alerts.yml
+++ b/deployment/docker/alerts.yml
@ -106,6 +106,23 @@ groups:
            High Churn Rate tightly connected with database performance and may
            result in unexpected OOM's or slow queries."

+      - alert: TooHighChurnRate24h
+        expr: |
+          sum(increase(vm_new_timeseries_created_total[24h])) by(instance)
+          >
+          (sum(vm_cache_entries{type="storage/hour_metric_ids"}) by(instance) * 3)
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=66&var-instance={{ $labels.instance }}"
+          summary: "Too high number of new series on \"{{ $labels.instance }}\" created over last 24h"
+          description: "The number of new time series created over the last 24h is 3x higher than
+            current number of active series on \"{{ $labels.instance }}\".\n
+            This effect is known as Churn Rate.\n
+            High Churn Rate tightly connected with database performance and may
+            result in unexpected OOM's or slow queries."
+
      - alert: TooHighSlowInsertsRate
        expr: |
          (
--- a/deployment/docker/docker-compose.yml
+++ b/deployment/docker/docker-compose.yml
@ -71,6 +71,7 @@ services:
      # display source of alerts in grafana
      - '-external.url=http://127.0.0.1:3000' #grafana outside container
      - '--external.alert.source=explore?orgId=1&left=["now-1h","now","VictoriaMetrics",{"expr":"{{$$expr|quotesEscape|crlfEscape|queryEscape}}"},{"mode":"Metrics"},{"ui":[true,true,true,"none"]}]' ## when copypaste the line be aware of '$$' for escaping in '$expr'    networks:
+    networks:
      - vm_net
    restart: always
  alertmanager:
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@ -2,6 +2,10 @@

 # tip

+
+# [v1.57.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.57.0)
+
+* FEATURE: optimize query performance by up to 10x on systems with many CPU cores. See [this tweet](https://twitter.com/MetricsVictoria/status/1375064484860067840).
 * FEATURE: add the following metrics at `/metrics` page for every VictoraMetrics app:
  * `process_resident_memory_anon_bytes` - RSS share for memory allocated by the process itself.  This share cannot be freed by the OS, so it must be taken into account by OOM killer.
  * `process_resident_memory_file_bytes` - RSS share for page cache memory (aka memory-mapped files). This share can be freed by the OS at any time, so it must be ignored by OOM killer.
@ -9,12 +13,17 @@
  * `process_resident_memory_peak_bytes` - peak RSS usage for the process.
  * `process_virtual_memory_peak_bytes` - peak virtual memory usage for the process.
 * FEATURE: accept and enforce `extra_label=<label_name>=<label_value>` query arg at [Graphite APIs](https://victoriametrics.github.io/#graphite-api-usage).
-* FEATURE: use Influx field as metric name if measurement is empty and `-influxSkipSingleField` command-line is set. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1139
+* FEATURE: use Influx field as metric name if measurement is empty and `-influxSkipSingleField` command-line is set. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1139).
 * FEATURE: vmagent: add `-promscrape.consul.waitTime` command-line flag for tuning the maximum wait time for Consul service discovery. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1144).
+* FEATURE: vmagent: add `vm_promscrape_discovery_kubernetes_stale_resource_versions_total` metric for monitoring the frequency of `too old resource version` errors during Kubernetes service discovery.
+* FEATURE: single-node VictoriaMetrics: log metrics with timestamps older than `-search.cacheTimestampOffset` compared to the current time. See [these docs](https://victoriametrics.github.io/#backfilling) for details.

 * BUGFIX: prevent from infinite loop on `{__graphite__="..."}` filters when a metric name contains `*`, `{` or `[` chars.
 * BUGFIX: prevent from infinite loop in `/metrics/find` and `/metrics/expand` [Graphite Metrics API handlers](https://victoriametrics.github.io/#graphite-metrics-api-usage) when they match metric names or labels with `*`, `{` or `[` chars.
 * BUGFIX: do not merge duplicate time series during requests to `/api/v1/query`. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1141
+* BUGFIX: vmagent: properly handle `too old resource version` error messages from Kubernetes watch API. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1150
+* BUGFIX: vmagent: do not retry sending data blocks if remote storage returns `400 Bad Request` error. The number of dropped blocks due to such errors can be monitored with the `vmagent_remotewrite_packets_dropped_total` metric. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1149
+* BUGFIX: properly calculate `summarize` and `*Series` functions in [Graphite Render API](https://victoriametrics.github.io/#graphite-render-api-usage).


 # [v1.56.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.56.0)
--- a/docs/Cluster-VictoriaMetrics.md
+++ b/docs/Cluster-VictoriaMetrics.md
@ -272,6 +272,8 @@ the update process. See [cluster availability](#cluster-availability) section fo
  - `vminsert` re-routes incoming data from unavailable `vmstorage` nodes to healthy `vmstorage` nodes
  - `vmselect` continues serving partial responses if at least a single `vmstorage` node is available. If consistency over availability is preferred, then either pass `-search.denyPartialResponse` command-line flag to `vmselect` or pass `deny_partial_response=1` query arg in requests to `vmselect`.

+`vmselect` doesn't serve partial responses for API handlers returning raw datapoints - [`/api/v1/export*` endpoints](https://victoriametrics.github.io/#how-to-export-time-series), since users usually expect this data to always be complete.
+
 Data replication can be used for increasing storage durability. See [these docs](#replication-and-data-safety) for details.


--- a/docs/vmagent.md
+++ b/docs/vmagent.md
@ -358,6 +358,10 @@ It may be useful to perform `vmagent` rolling update without any scrape loss.
  Such gaps may appear because `vmagent` cannot keep up with sending the collected data to remote storage. Therefore it starts dropping the buffered data
  if the on-disk buffer size exceeds `-remoteWrite.maxDiskUsagePerURL`.

+* `vmagent` drops data blocks if remote storage replies with `400 Bad Request` or `409 Conflict` HTTP responses. The number of dropped blocks can be monitored via the `vmagent_remotewrite_packets_dropped_total` metric exported at the [/metrics page](#monitoring).
+
+* Use `-remoteWrite.queues=1` when `-remoteWrite.url` points to a remote storage that doesn't accept out-of-order samples (aka data backfilling). Such storage systems include Prometheus, Cortex and Thanos.
+
 * `vmagent` buffers scraped data at the `-remoteWrite.tmpDataPath` directory until it is sent to `-remoteWrite.url`.
  The directory can grow large when remote storage is unavailable for extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
  If you don't want to send all the data from the directory to remote storage then simply stop `vmagent` and delete the directory.
--- a/go.mod
+++ b/go.mod
@ -18,7 +18,7 @@ require (
 	github.com/go-kit/kit v0.10.0
 	github.com/golang/snappy v0.0.3
 	github.com/influxdata/influxdb v1.8.4
-	github.com/klauspost/compress v1.11.12
+	github.com/klauspost/compress v1.11.13
 	github.com/mattn/go-runewidth v0.0.10 // indirect
 	github.com/oklog/ulid v1.3.1
 	github.com/prometheus/client_golang v1.10.0 // indirect
--- a/go.sum
+++ b/go.sum
@ -514,8 +514,9 @@ github.com/klauspost/compress v1.4.0/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0
 github.com/klauspost/compress v1.9.5/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
 github.com/klauspost/compress v1.10.7/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
 github.com/klauspost/compress v1.11.0/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
-github.com/klauspost/compress v1.11.12 h1:famVnQVu7QwryBN4jNseQdUKES71ZAOnB6UQQJPZvqk=
 github.com/klauspost/compress v1.11.12/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
+github.com/klauspost/compress v1.11.13 h1:eSvu8Tmq6j2psUJqJrLcWH6K3w5Dwc+qipbaA6eVEN4=
+github.com/klauspost/compress v1.11.13/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
 github.com/klauspost/cpuid v0.0.0-20170728055534-ae7887de9fa5/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
 github.com/klauspost/crc32 v0.0.0-20161016154125-cb6bfca970f6/go.mod h1:+ZoRqAPRLkC4NPOvfYeR5KNOrY6TD+/sAC3HXPZgDYg=
 github.com/klauspost/pgzip v1.0.2-0.20170402124221-0bf5dcad4ada/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
--- a/lib/promscrape/discovery/kubernetes/api_watcher.go
+++ b/lib/promscrape/discovery/kubernetes/api_watcher.go
@ -312,6 +312,7 @@ type urlWatcher struct {
 	objectsAdded          *metrics.Counter
 	objectsRemoved        *metrics.Counter
 	objectsUpdated        *metrics.Counter
+	staleResourceVersions *metrics.Counter
 }

 func newURLWatcher(role, apiURL string, gw *groupWatcher) *urlWatcher {
@ -333,6 +334,7 @@ func newURLWatcher(role, apiURL string, gw *groupWatcher) *urlWatcher {
 		objectsAdded:          metrics.GetOrCreateCounter(fmt.Sprintf(`vm_promscrape_discovery_kubernetes_objects_added_total{role=%q}`, role)),
 		objectsRemoved:        metrics.GetOrCreateCounter(fmt.Sprintf(`vm_promscrape_discovery_kubernetes_objects_removed_total{role=%q}`, role)),
 		objectsUpdated:        metrics.GetOrCreateCounter(fmt.Sprintf(`vm_promscrape_discovery_kubernetes_objects_updated_total{role=%q}`, role)),
+		staleResourceVersions: metrics.GetOrCreateCounter(fmt.Sprintf(`vm_promscrape_discovery_kubernetes_stale_resource_versions_total{role=%q}`, role)),
 	}
 	logger.Infof("started %s watcher for %q", uw.role, uw.apiURL)
 	go uw.watchForUpdates()
@ -502,14 +504,15 @@ func (uw *urlWatcher) watchForUpdates() {
 			continue
 		}
 		if resp.StatusCode != http.StatusOK {
 			body, _ := ioutil.ReadAll(resp.Body)
 			_ = resp.Body.Close()
-			logger.Errorf("unexpected status code for request to %q: %d; want %d; response: %q", requestURL, resp.StatusCode, http.StatusOK, body)
 			if resp.StatusCode == 410 {
 				// There is no need for sleep on 410 error. See https://kubernetes.io/docs/reference/using-api/api-concepts/#410-gone-responses
 				backoffDelay = time.Second
+				uw.staleResourceVersions.Inc()
 				uw.setResourceVersion("")
 			} else {
+				logger.Errorf("unexpected status code for request to %q: %d; want %d; response: %q", requestURL, resp.StatusCode, http.StatusOK, body)
 				backoffSleep()
 			}
 			continue
@ -580,13 +583,25 @@ func (uw *urlWatcher) readObjectUpdateStream(r io.Reader) error {
 				return fmt.Errorf("cannot parse bookmark from %q: %w", we.Object, err)
 			}
 			uw.setResourceVersion(bm.Metadata.ResourceVersion)
+		case "ERROR":
+			em, err := parseError(we.Object)
+			if err != nil {
+				return fmt.Errorf("cannot parse error message from %q: %w", we.Object, err)
+			}
+			if em.Code == 410 {
+				// See https://kubernetes.io/docs/reference/using-api/api-concepts/#410-gone-responses
+				uw.staleResourceVersions.Inc()
+				uw.setResourceVersion("")
+				return nil
+			}
+			return fmt.Errorf("unexpected error message: %q", we.Object)
 		default:
-			return fmt.Errorf("unexpected WatchEvent type %q for role %q", we.Type, uw.role)
+			return fmt.Errorf("unexpected WatchEvent type %q: %q", we.Type, we.Object)
 		}
 	}
 }

-// Bookmark is a bookmark from Kubernetes Watch API.
+// Bookmark is a bookmark message from Kubernetes Watch API.
 // See https://kubernetes.io/docs/reference/using-api/api-concepts/#watch-bookmarks
 type Bookmark struct {
 	Metadata struct {
@ -602,6 +617,19 @@ func parseBookmark(data []byte) (*Bookmark, error) {
 	return &bm, nil
 }

+// Error is an error message from Kubernetes Watch API.
+type Error struct {
+	Code int
+}
+
+func parseError(data []byte) (*Error, error) {
+	var em Error
+	if err := json.Unmarshal(data, &em); err != nil {
+		return nil, err
+	}
+	return &em, nil
+}
+
 func getAPIPaths(role string, namespaces []string, selectors []Selector) []string {
 	objectName := getObjectNameByRole(role)
 	if objectName == "nodes" || len(namespaces) == 0 {
--- a/lib/storage/merge.go
+++ b/lib/storage/merge.go
@ -179,11 +179,14 @@ func mergeBlocks(ob, ib1, ib2 *Block, retentionDeadline int64, rowsDeleted *uint
 func skipSamplesOutsideRetention(b *Block, retentionDeadline int64, rowsDeleted *uint64) {
 	timestamps := b.timestamps
 	nextIdx := b.nextIdx
+	nextIdxOrig := nextIdx
 	for nextIdx < len(timestamps) && timestamps[nextIdx] < retentionDeadline {
 		nextIdx++
 	}
-	atomic.AddUint64(rowsDeleted, uint64(nextIdx-b.nextIdx))
+	if n := nextIdx - nextIdxOrig; n > 0 {
+		atomic.AddUint64(rowsDeleted, uint64(n))
 		b.nextIdx = nextIdx
+	}
 }

 func appendRows(ob, ib *Block) {
--- a/lib/storage/storage.go
+++ b/lib/storage/storage.go
@ -1272,7 +1272,7 @@ func (mr *MetricRow) String() string {
 	if err := mn.unmarshalRaw(mr.MetricNameRaw); err == nil {
 		metricName = mn.String()
 	}
-	return fmt.Sprintf("MetricName=%s, Timestamp=%d, Value=%f\n", metricName, mr.Timestamp, mr.Value)
+	return fmt.Sprintf("%s (Timestamp=%d, Value=%f)", metricName, mr.Timestamp, mr.Value)
 }

 // Marshal appends marshaled mr to dst and returns the result.
--- a/lib/uint64set/uint64set.go
+++ b/lib/uint64set/uint64set.go
@ -186,19 +186,6 @@ func (s *Set) Has(x uint64) bool {
 	hi32 := uint32(x >> 32)
 	lo32 := uint32(x)
 	bs := s.buckets
-	if len(bs) > 0 && bs[0].hi == hi32 {
-		// Manually inline bucket32.has for performance reasons.
-		hi16 := uint16(lo32 >> 16)
-		lo16 := uint16(lo32)
-		b32 := &bs[0]
-		his := b32.b16his
-		if n := b32.getHint(); n < uint32(len(his)) && his[n] == hi16 {
-			// Fast path - check the previously used bucket.
-			bs := b32.buckets
-			return n < uint32(len(bs)) && bs[n].has(lo16)
-		}
-		return b32.hasSlow(hi16, lo16)
-	}
 	for i := range bs {
 		b32 := &bs[i]
 		if b32.hi == hi32 {
@ -671,22 +658,13 @@ func (b *bucket32) addBucketAtPos(hi uint16, pos int) *bucket16 {
 func (b *bucket32) has(x uint32) bool {
 	hi := uint16(x >> 16)
 	lo := uint16(x)
-	his := b.b16his
-	if n := b.getHint(); n < uint32(len(his)) && his[n] == hi {
-		// Fast path - check the previously used bucket.
-		bs := b.buckets
-		return n < uint32(len(bs)) && bs[n].has(lo)
-	}
-	return b.hasSlow(hi, lo)
-}
-
-func (b *bucket32) hasSlow(hi, lo uint16) bool {
 	his := b.b16his
 	n := binarySearch16(his, hi)
 	if n < 0 || n >= len(his) || his[n] != hi {
 		return false
 	}
-	b.setHint(n)
+	// Do not call b.setHint(n) here, since this may trash performance
+	// when many concurrent goroutines call b.has() method from many CPU cores.
 	bs := b.buckets
 	return n < len(bs) && bs[n].has(lo)
 }
--- a/vendor/github.com/klauspost/compress/flate/deflate.go
+++ b/vendor/github.com/klauspost/compress/flate/deflate.go
@ -645,15 +645,15 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
 		d.fill = (*compressor).fillBlock
 		d.step = (*compressor).store
 	case level == ConstantCompression:
-		d.w.logNewTablePenalty = 4
-		d.window = make([]byte, maxStoreBlockSize)
+		d.w.logNewTablePenalty = 8
+		d.window = make([]byte, 32<<10)
 		d.fill = (*compressor).fillBlock
 		d.step = (*compressor).storeHuff
 	case level == DefaultCompression:
 		level = 5
 		fallthrough
 	case level >= 1 && level <= 6:
-		d.w.logNewTablePenalty = 6
+		d.w.logNewTablePenalty = 8
 		d.fast = newFastEnc(level)
 		d.window = make([]byte, maxStoreBlockSize)
 		d.fill = (*compressor).fillBlock
--- a/vendor/github.com/klauspost/compress/flate/fast_encoder.go
+++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go
@ -6,6 +6,7 @@
 package flate

 import (
+	"encoding/binary"
 	"fmt"
 	"math/bits"
 )
@ -65,26 +66,15 @@ func load32(b []byte, i int) uint32 {
 }

 func load64(b []byte, i int) uint64 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:8]
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+	return binary.LittleEndian.Uint64(b[i:])
 }

 func load3232(b []byte, i int32) uint32 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:4]
-	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+	return binary.LittleEndian.Uint32(b[i:])
 }

 func load6432(b []byte, i int32) uint64 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:8]
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+	return binary.LittleEndian.Uint64(b[i:])
 }

 func hash(u uint32) uint32 {
@ -225,9 +215,9 @@ func (e *fastGen) Reset() {
 func matchLen(a, b []byte) int {
 	b = b[:len(a)]
 	var checked int
-	if len(a) > 4 {
+	if len(a) >= 4 {
 		// Try 4 bytes first
-		if diff := load32(a, 0) ^ load32(b, 0); diff != 0 {
+		if diff := binary.LittleEndian.Uint32(a) ^ binary.LittleEndian.Uint32(b); diff != 0 {
 			return bits.TrailingZeros32(diff) >> 3
 		}
 		// Switch to 8 byte matching.
@ -236,7 +226,7 @@ func matchLen(a, b []byte) int {
 		b = b[4:]
 		for len(a) >= 8 {
 			b = b[:len(a)]
-			if diff := load64(a, 0) ^ load64(b, 0); diff != 0 {
+			if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 {
 				return checked + (bits.TrailingZeros64(diff) >> 3)
 			}
 			checked += 8
@ -247,7 +237,7 @@ func matchLen(a, b []byte) int {
 	b = b[:len(a)]
 	for i := range a {
 		if a[i] != b[i] {
-			return int(i) + checked
+			return i + checked
 		}
 	}
 	return len(a) + checked
--- a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
+++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
@ -5,6 +5,7 @@
 package flate

 import (
+	"encoding/binary"
 	"io"
 )

@ -206,7 +207,7 @@ func (w *huffmanBitWriter) write(b []byte) {
 }

 func (w *huffmanBitWriter) writeBits(b int32, nb uint16) {
-	w.bits |= uint64(b) << (w.nbits & reg16SizeMask64)
+	w.bits |= uint64(b) << w.nbits
 	w.nbits += nb
 	if w.nbits >= 48 {
 		w.writeOutBits()
@ -420,13 +421,11 @@ func (w *huffmanBitWriter) writeOutBits() {
 	w.bits >>= 48
 	w.nbits -= 48
 	n := w.nbytes
-	w.bytes[n] = byte(bits)
-	w.bytes[n+1] = byte(bits >> 8)
-	w.bytes[n+2] = byte(bits >> 16)
-	w.bytes[n+3] = byte(bits >> 24)
-	w.bytes[n+4] = byte(bits >> 32)
-	w.bytes[n+5] = byte(bits >> 40)
+
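+	// Store all 8 bytes with a single call; only the first 6 are kept since n advances by 6, which is faster than six single-byte stores.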
+	binary.LittleEndian.PutUint64(w.bytes[n:], bits)
 	n += 6
+
 	if n >= bufferFlushSize {
 		if w.err != nil {
 			n = 0
@ -435,6 +434,7 @@ func (w *huffmanBitWriter) writeOutBits() {
 		w.write(w.bytes[:n])
 		n = 0
 	}
+
 	w.nbytes = n
 }

@ -759,7 +759,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 		} else {
 			// inlined
 			c := lengths[lengthCode&31]
-			w.bits |= uint64(c.code) << (w.nbits & reg16SizeMask64)
+			w.bits |= uint64(c.code) << w.nbits
 			w.nbits += c.len
 			if w.nbits >= 48 {
 				w.writeOutBits()
@ -779,7 +779,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 		} else {
 			// inlined
 			c := offs[offsetCode&31]
-			w.bits |= uint64(c.code) << (w.nbits & reg16SizeMask64)
+			w.bits |= uint64(c.code) << w.nbits
 			w.nbits += c.len
 			if w.nbits >= 48 {
 				w.writeOutBits()
@ -830,8 +830,8 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 	// Assume header is around 70 bytes:
 	// https://stackoverflow.com/a/25454430
 	const guessHeaderSizeBits = 70 * 8
-	estBits, estExtra := histogramSize(input, w.literalFreq[:], !eof && !sync)
-	estBits += w.lastHeader + 15
+	estBits := histogramSize(input, w.literalFreq[:], !eof && !sync)
+	estBits += w.lastHeader + len(input)/32
 	if w.lastHeader == 0 {
 		estBits += guessHeaderSizeBits
 	}
@ -845,9 +845,9 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 		return
 	}

+	reuseSize := 0
 	if w.lastHeader > 0 {
-		reuseSize := w.literalEncoding.bitLength(w.literalFreq[:256])
-		estBits += estExtra
+		reuseSize = w.literalEncoding.bitLength(w.literalFreq[:256])

 		if estBits < reuseSize {
 			// We owe an EOB
@ -859,6 +859,10 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 	const numLiterals = endBlockMarker + 1
 	const numOffsets = 1
 	if w.lastHeader == 0 {
+		if !eof && !sync {
+			// Give every literal a non-zero count, so the generated tree is slightly suboptimal but has a code for all of them.
+			fillHist(w.literalFreq[:numLiterals])
+		}
 		w.literalFreq[endBlockMarker] = 1
 		w.literalEncoding.generate(w.literalFreq[:numLiterals], 15)

@ -878,19 +882,14 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 	for _, t := range input {
 		// Bitwriting inlined, ~30% speedup
 		c := encoding[t]
-		w.bits |= uint64(c.code) << ((w.nbits) & reg16SizeMask64)
+		w.bits |= uint64(c.code) << w.nbits
 		w.nbits += c.len
 		if w.nbits >= 48 {
 			bits := w.bits
 			w.bits >>= 48
 			w.nbits -= 48
 			n := w.nbytes
-			w.bytes[n] = byte(bits)
-			w.bytes[n+1] = byte(bits >> 8)
-			w.bytes[n+2] = byte(bits >> 16)
-			w.bytes[n+3] = byte(bits >> 24)
-			w.bytes[n+4] = byte(bits >> 32)
-			w.bytes[n+5] = byte(bits >> 40)
+			binary.LittleEndian.PutUint64(w.bytes[n:], bits)
 			n += 6
 			if n >= bufferFlushSize {
 				if w.err != nil {
--- a/vendor/github.com/klauspost/compress/flate/huffman_code.go
+++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go
@ -122,6 +122,16 @@ func (h *huffmanEncoder) bitLength(freq []uint16) int {
 	return total
 }

+func (h *huffmanEncoder) bitLengthRaw(b []byte) int {
+	var total int
+	for _, f := range b {
+		if f != 0 {
+			total += int(h.codes[f].len)
+		}
+	}
+	return total
+}
+
 // Return the number of literals assigned to each bit size in the Huffman encoding
 //
 // This method is only called when list.length >= 3
@ -327,37 +337,40 @@ func atLeastOne(v float32) float32 {
 	return v
 }

+// Unassigned values are assigned '1' in the histogram.
+func fillHist(b []uint16) {
+	for i, v := range b {
+		if v == 0 {
+			b[i] = 1
+		}
+	}
+}
+
 // histogramSize accumulates a histogram of b in h.
 // An estimated size in bits is returned.
-// Unassigned values are assigned '1' in the histogram.
 // len(h) must be >= 256, and h's elements must be all zeroes.
-func histogramSize(b []byte, h []uint16, fill bool) (int, int) {
+func histogramSize(b []byte, h []uint16, fill bool) (bits int) {
 	h = h[:256]
 	for _, t := range b {
 		h[t]++
 	}
-	invTotal := 1.0 / float32(len(b))
-	shannon := float32(0.0)
-	var extra float32
+	total := len(b)
 	if fill {
-		oneBits := atLeastOne(-mFastLog2(invTotal))
-		for i, v := range h[:] {
-			if v > 0 {
-				n := float32(v)
-				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
-			} else {
-				h[i] = 1
-				extra += oneBits
-			}
-		}
-	} else {
-		for _, v := range h[:] {
-			if v > 0 {
-				n := float32(v)
-				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
+		for _, v := range h {
+			if v == 0 {
+				total++
 			}
 		}
 	}

-	return int(shannon + 0.99), int(extra + 0.99)
+	invTotal := 1.0 / float32(total)
+	shannon := float32(0.0)
+	for _, v := range h {
+		if v > 0 {
+			n := float32(v)
+			shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
+		}
+	}
+
+	return int(shannon + 0.99)
 }
--- a/vendor/github.com/klauspost/compress/flate/level2.go
+++ b/vendor/github.com/klauspost/compress/flate/level2.go
@ -155,7 +155,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {

 			// Store every second hash in-between, but offset by 1.
 			for i := s - l + 2; i < s-5; i += 7 {
-				x := load6432(src, int32(i))
+				x := load6432(src, i)
 				nextHash := hash4u(uint32(x), bTableBits)
 				e.table[nextHash] = tableEntry{offset: e.cur + i}
 				// Skip one
--- a/vendor/github.com/klauspost/compress/fse/compress.go
+++ b/vendor/github.com/klauspost/compress/fse/compress.go
@ -301,7 +301,7 @@ func (s *Scratch) writeCount() error {
 	out[outP+1] = byte(bitStream >> 8)
 	outP += (bitCount + 7) / 8

-	if uint16(charnum) > s.symbolLen {
+	if charnum > s.symbolLen {
 		return errors.New("internal error: charnum > s.symbolLen")
 	}
 	s.Out = out[:outP]
@ -331,7 +331,7 @@ type cTable struct {
 func (s *Scratch) allocCtable() {
 	tableSize := 1 << s.actualTableLog
 	// get tableSymbol that is big enough.
-	if cap(s.ct.tableSymbol) < int(tableSize) {
+	if cap(s.ct.tableSymbol) < tableSize {
 		s.ct.tableSymbol = make([]byte, tableSize)
 	}
 	s.ct.tableSymbol = s.ct.tableSymbol[:tableSize]
@ -565,8 +565,8 @@ func (s *Scratch) normalizeCount2() error {
 		distributed  uint32
 		total        = uint32(s.br.remain())
 		tableLog     = s.actualTableLog
-		lowThreshold = uint32(total >> tableLog)
-		lowOne       = uint32((total * 3) >> (tableLog + 1))
+		lowThreshold = total >> tableLog
+		lowOne       = (total * 3) >> (tableLog + 1)
 	)
 	for i, cnt := range s.count[:s.symbolLen] {
 		if cnt == 0 {
@ -591,7 +591,7 @@ func (s *Scratch) normalizeCount2() error {

 	if (total / toDistribute) > lowOne {
 		// risk of rounding to zero
-		lowOne = uint32((total * 3) / (toDistribute * 2))
+		lowOne = (total * 3) / (toDistribute * 2)
 		for i, cnt := range s.count[:s.symbolLen] {
 			if (s.norm[i] == notYetAssigned) && (cnt <= lowOne) {
 				s.norm[i] = 1
--- a/vendor/github.com/klauspost/compress/fse/decompress.go
+++ b/vendor/github.com/klauspost/compress/fse/decompress.go
@ -172,7 +172,7 @@ type decSymbol struct {
 // allocDtable will allocate decoding tables if they are not big enough.
 func (s *Scratch) allocDtable() {
 	tableSize := 1 << s.actualTableLog
-	if cap(s.decTable) < int(tableSize) {
+	if cap(s.decTable) < tableSize {
 		s.decTable = make([]decSymbol, tableSize)
 	}
 	s.decTable = s.decTable[:tableSize]
@ -340,7 +340,7 @@ type decoder struct {
 func (d *decoder) init(in *bitReader, dt []decSymbol, tableLog uint8) {
 	d.dt = dt
 	d.br = in
-	d.state = uint16(in.getBits(tableLog))
+	d.state = in.getBits(tableLog)
 }

 // next returns the next symbol and sets the next state.
--- a/vendor/github.com/klauspost/compress/huff0/compress.go
+++ b/vendor/github.com/klauspost/compress/huff0/compress.go
@ -403,7 +403,7 @@ func (s *Scratch) buildCTable() error {
 	var startNode = int16(s.symbolLen)
 	nonNullRank := s.symbolLen - 1

-	nodeNb := int16(startNode)
+	nodeNb := startNode
 	huffNode := s.nodes[1 : huffNodesLen+1]

 	// This overlays the slice above, but allows "-1" index lookups.
@ -580,7 +580,7 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {

 		// Get pos of last (smallest) symbol per rank
 		{
-			currentNbBits := uint8(maxNbBits)
+			currentNbBits := maxNbBits
 			for pos := int(n); pos >= 0; pos-- {
 				if huffNode[pos].nbBits >= currentNbBits {
 					continue
--- a/vendor/github.com/klauspost/compress/snappy/snappy.go
+++ b/vendor/github.com/klauspost/compress/snappy/snappy.go
@ -94,5 +94,5 @@ var crcTable = crc32.MakeTable(crc32.Castagnoli)
 // https://github.com/google/snappy/blob/master/framing_format.txt
 func crc(b []byte) uint32 {
 	c := crc32.Update(0, crcTable, b)
-	return uint32(c>>15|c<<17) + 0xa282ead8
+	return c>>15 | c<<17 + 0xa282ead8
 }
--- a/vendor/github.com/klauspost/compress/zstd/blockenc.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockenc.go
@ -23,27 +23,43 @@ type blockEnc struct {
 	wr         bitWriter

 	extraLits         int
-	last      bool
-
 	output            []byte
 	recentOffsets     [3]uint32
 	prevRecentOffsets [3]uint32
+
+	last   bool
+	lowMem bool
 }

 // init should be used once the block has been created.
 // If called more than once, the effect is the same as calling reset.
 func (b *blockEnc) init() {
-	if cap(b.literals) < maxCompressedLiteralSize {
-		b.literals = make([]byte, 0, maxCompressedLiteralSize)
+	if b.lowMem {
+		// 1K literals
+		if cap(b.literals) < 1<<10 {
+			b.literals = make([]byte, 0, 1<<10)
+		}
+		const defSeqs = 20
+		if cap(b.sequences) < defSeqs {
+			b.sequences = make([]seq, 0, defSeqs)
+		}
+		// 1K
+		if cap(b.output) < 1<<10 {
+			b.output = make([]byte, 0, 1<<10)
+		}
+	} else {
+		if cap(b.literals) < maxCompressedBlockSize {
+			b.literals = make([]byte, 0, maxCompressedBlockSize)
 		}
 		const defSeqs = 200
-	b.literals = b.literals[:0]
 		if cap(b.sequences) < defSeqs {
 			b.sequences = make([]seq, 0, defSeqs)
 		}
 		if cap(b.output) < maxCompressedBlockSize {
 			b.output = make([]byte, 0, maxCompressedBlockSize)
 		}
+	}
+
 	if b.coders.mlEnc == nil {
 		b.coders.mlEnc = &fseEncoder{}
 		b.coders.mlPrev = &fseEncoder{}
--- a/vendor/github.com/klauspost/compress/zstd/enc_base.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_base.go
@ -7,6 +7,10 @@ import (
 	"github.com/klauspost/compress/zstd/internal/xxhash"
 )

+const (
+	dictShardBits = 6
+)
+
 type fastBase struct {
 	// cur is the offset at the start of hist
 	cur int32
@ -17,6 +21,7 @@ type fastBase struct {
 	tmp         [8]byte
 	blk         *blockEnc
 	lastDictID  uint32
+	lowMem      bool
 }

 // CRC returns the underlying CRC writer.
@ -57,15 +62,10 @@ func (e *fastBase) addBlock(src []byte) int32 {
 	// check if we have space already
 	if len(e.hist)+len(src) > cap(e.hist) {
 		if cap(e.hist) == 0 {
-			l := e.maxMatchOff * 2
-			// Make it at least 1MB.
-			if l < 1<<20 {
-				l = 1 << 20
-			}
-			e.hist = make([]byte, 0, l)
+			e.ensureHist(len(src))
 		} else {
-			if cap(e.hist) < int(e.maxMatchOff*2) {
-				panic("unexpected buffer size")
+			if cap(e.hist) < int(e.maxMatchOff+maxCompressedBlockSize) {
+				panic(fmt.Errorf("unexpected buffer cap %d, want at least %d with window %d", cap(e.hist), e.maxMatchOff+maxCompressedBlockSize, e.maxMatchOff))
 			}
 			// Move down
 			offset := int32(len(e.hist)) - e.maxMatchOff
@ -79,6 +79,28 @@ func (e *fastBase) addBlock(src []byte) int32 {
 	return s
 }

+// ensureHist makes sure that e.hist has capacity for at least n bytes, allocating a new empty buffer if it does not.
+func (e *fastBase) ensureHist(n int) {
+	if cap(e.hist) >= n {
+		return
+	}
+	l := e.maxMatchOff
+	if (e.lowMem && e.maxMatchOff > maxCompressedBlockSize) || e.maxMatchOff <= maxCompressedBlockSize {
+		l += maxCompressedBlockSize
+	} else {
+		l += e.maxMatchOff
+	}
+	// Make it at least 1MB.
+	if l < 1<<20 && !e.lowMem {
+		l = 1 << 20
+	}
+	// Make it at least the requested size.
+	if l < int32(n) {
+		l = int32(n)
+	}
+	e.hist = make([]byte, 0, l)
+}
+
 // useBlock will replace the block with the provided one,
 // but transfer recent offsets from the previous.
 func (e *fastBase) UseBlock(enc *blockEnc) {
@ -117,7 +139,7 @@ func (e *fastBase) matchlen(s, t int32, src []byte) int32 {
 // Reset the encoding table.
 func (e *fastBase) resetBase(d *dict, singleBlock bool) {
 	if e.blk == nil {
-		e.blk = &blockEnc{}
+		e.blk = &blockEnc{lowMem: e.lowMem}
 		e.blk.init()
 	} else {
 		e.blk.reset(nil)
--- a/vendor/github.com/klauspost/compress/zstd/enc_best.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_best.go
@ -407,6 +407,7 @@ encodeLoop:
 // Most notable difference is that src will not be copied for history and
 // we do not need to check for max match length.
 func (e *bestFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
+	e.ensureHist(len(src))
 	e.Encode(blk, src)
 }

--- a/vendor/github.com/klauspost/compress/zstd/enc_better.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_better.go
@ -16,6 +16,12 @@ const (
 	// This greatly depends on the type of input.
 	betterShortTableBits = 13                        // Bits used in the short match table
 	betterShortTableSize = 1 << betterShortTableBits // Size of the table
+
+	betterLongTableShardCnt  = 1 << (betterLongTableBits - dictShardBits)    // Number of shards in the table
+	betterLongTableShardSize = betterLongTableSize / betterLongTableShardCnt // Size of an individual shard
+
+	betterShortTableShardCnt  = 1 << (betterShortTableBits - dictShardBits)     // Number of shards in the table
+	betterShortTableShardSize = betterShortTableSize / betterShortTableShardCnt // Size of an individual shard
 )

 type prevEntry struct {
@ -33,8 +39,15 @@ type betterFastEncoder struct {
 	fastBase
 	table     [betterShortTableSize]tableEntry
 	longTable [betterLongTableSize]prevEntry
+}
+
+type betterFastEncoderDict struct {
+	betterFastEncoder
 	dictTable            []tableEntry
 	dictLongTable        []prevEntry
+	shortTableShardDirty [betterShortTableShardCnt]bool
+	longTableShardDirty  [betterLongTableShardCnt]bool
+	allDirty             bool
 }

 // Encode improves compression...
@ -516,11 +529,511 @@ encodeLoop:
 // Most notable difference is that src will not be copied for history and
 // we do not need to check for max match length.
 func (e *betterFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
+	e.ensureHist(len(src))
 	e.Encode(blk, src)
 }

+// Encode improves compression...
+func (e *betterFastEncoderDict) Encode(blk *blockEnc, src []byte) {
+	const (
+		// Input margin is the number of bytes we read (8)
+		// and the maximum we will read ahead (2)
+		inputMargin            = 8 + 2
+		minNonLiteralBlockSize = 16
+	)
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.longTable[:] {
+				e.longTable[i] = prevEntry{}
+			}
+			e.cur = e.maxMatchOff
+			e.allDirty = true
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.longTable[:] {
+			v := e.longTable[i].offset
+			v2 := e.longTable[i].prev
+			if v < minOff {
+				v = 0
+				v2 = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+				if v2 < minOff {
+					v2 = 0
+				} else {
+					v2 = v2 - e.cur + e.maxMatchOff
+				}
+			}
+			e.longTable[i] = prevEntry{
+				offset: v,
+				prev:   v2,
+			}
+		}
+		e.allDirty = true
+		e.cur = e.maxMatchOff
+		break
+	}
+
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+
+	// Override src
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 1.
+	const stepSize = 1
+
+	const kSearchStrength = 9
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	if debug {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		var t int32
+		// We allow the encoder to optionally turn off repeat offsets across blocks
+		canRepeat := len(blk.sequences) > 2
+		var matched int32
+
+		for {
+			if debugAsserts && canRepeat && offset1 == 0 {
+				panic("offset0 was 0")
+			}
+
+			nextHashS := hash5(cv, betterShortTableBits)
+			nextHashL := hash8(cv, betterLongTableBits)
+			candidateL := e.longTable[nextHashL]
+			candidateS := e.table[nextHashS]
+
+			const repOff = 1
+			repIndex := s - offset1 + repOff
+			off := s + e.cur
+			e.longTable[nextHashL] = prevEntry{offset: off, prev: candidateL.offset}
+			e.markLongShardDirty(nextHashL)
+			e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
+			e.markShortShardDirty(nextHashS)
+
+			if canRepeat {
+				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
+					// Consider history as well.
+					var seq seq
+					lenght := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
+
+					seq.matchLen = uint32(lenght - zstdMinMatch)
+
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+
+					// rep 0
+					seq.offset = 1
+					if debugSequences {
+						println("repeat sequence", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+
+					// Index match start+1 (long) -> s - 1
+					index0 := s + repOff
+					s += lenght + repOff
+
+					nextEmit = s
+					if s >= sLimit {
+						if debug {
+							println("repeat ended", s, lenght)
+
+						}
+						break encodeLoop
+					}
+					// Index skipped...
+					for index0 < s-1 {
+						cv0 := load6432(src, index0)
+						cv1 := cv0 >> 8
+						h0 := hash8(cv0, betterLongTableBits)
+						off := index0 + e.cur
+						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+						e.markLongShardDirty(h0)
+						h1 := hash5(cv1, betterShortTableBits)
+						e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						e.markShortShardDirty(h1)
+						index0 += 2
+					}
+					cv = load6432(src, s)
+					continue
+				}
+				const repOff2 = 1
+
+				// We deviate from the reference encoder and also check offset 2.
+				// Still slower and not much better, so disabled.
+				// repIndex = s - offset2 + repOff2
+				if false && repIndex >= 0 && load6432(src, repIndex) == load6432(src, s+repOff) {
+					// Consider history as well.
+					var seq seq
+					lenght := 8 + e.matchlen(s+8+repOff2, repIndex+8, src)
+
+					seq.matchLen = uint32(lenght - zstdMinMatch)
+
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff2
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+
+					// rep 2
+					seq.offset = 2
+					if debugSequences {
+						println("repeat sequence 2", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+
+					index0 := s + repOff2
+					s += lenght + repOff2
+					nextEmit = s
+					if s >= sLimit {
+						if debug {
+							println("repeat ended", s, lenght)
+
+						}
+						break encodeLoop
+					}
+
+					// Index skipped...
+					for index0 < s-1 {
+						cv0 := load6432(src, index0)
+						cv1 := cv0 >> 8
+						h0 := hash8(cv0, betterLongTableBits)
+						off := index0 + e.cur
+						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+						e.markLongShardDirty(h0)
+						h1 := hash5(cv1, betterShortTableBits)
+						e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						e.markShortShardDirty(h1)
+						index0 += 2
+					}
+					cv = load6432(src, s)
+					// Swap offsets
+					offset1, offset2 = offset2, offset1
+					continue
+				}
+			}
+			// Find the offsets of our two matches.
+			coffsetL := candidateL.offset - e.cur
+			coffsetLP := candidateL.prev - e.cur
+
+			// Check if we have a long match.
+			if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+				// Found a long match, at least 8 bytes.
+				matched = e.matchlen(s+8, coffsetL+8, src) + 8
+				t = coffsetL
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+
+				if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
+					// Found a long match, at least 8 bytes.
+					prevMatch := e.matchlen(s+8, coffsetLP+8, src) + 8
+					if prevMatch > matched {
+						matched = prevMatch
+						t = coffsetLP
+					}
+					if debugAsserts && s <= t {
+						panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+					}
+					if debugAsserts && s-t > e.maxMatchOff {
+						panic("s - t >e.maxMatchOff")
+					}
+					if debugMatches {
+						println("long match")
+					}
+				}
+				break
+			}
+
+			// Check if we have a long match on prev.
+			if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
+				// Found a long match, at least 8 bytes.
+				matched = e.matchlen(s+8, coffsetLP+8, src) + 8
+				t = coffsetLP
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+				break
+			}
+
+			coffsetS := candidateS.offset - e.cur
+
+			// Check if we have a short match.
+			if s-coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
+				// found a regular match
+				matched = e.matchlen(s+4, coffsetS+4, src) + 4
+
+				// See if we can find a long match at s+1
+				const checkAt = 1
+				cv := load6432(src, s+checkAt)
+				nextHashL = hash8(cv, betterLongTableBits)
+				candidateL = e.longTable[nextHashL]
+				coffsetL = candidateL.offset - e.cur
+
+				// We can store it, since we have at least a 4 byte match.
+				e.longTable[nextHashL] = prevEntry{offset: s + checkAt + e.cur, prev: candidateL.offset}
+				e.markLongShardDirty(nextHashL)
+				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+					// Found a long match, at least 8 bytes.
+					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
+					if matchedNext > matched {
+						t = coffsetL
+						s += checkAt
+						matched = matchedNext
+						if debugMatches {
+							println("long match (after short)")
+						}
+						break
+					}
+				}
+
+				// Check prev long...
+				coffsetL = candidateL.prev - e.cur
+				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+					// Found a long match, at least 8 bytes.
+					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
+					if matchedNext > matched {
+						t = coffsetL
+						s += checkAt
+						matched = matchedNext
+						if debugMatches {
+							println("prev long match (after short)")
+						}
+						break
+					}
+				}
+				t = coffsetS
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				if debugMatches {
+					println("short match")
+				}
+				break
+			}
+
+			// No match found, move forward in input.
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+
+		// A 4-byte match has been found. Update recent offsets.
+		// We'll later see if more than 4 bytes.
+		offset2 = offset1
+		offset1 = s - t
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+
+		// Extend the n-byte match as long as possible.
+		l := matched
+
+		// Extend backwards
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+
+		// Index match start+1 (long) -> s - 1
+		index0 := s - l + 1
+		for index0 < s-1 {
+			cv0 := load6432(src, index0)
+			cv1 := cv0 >> 8
+			h0 := hash8(cv0, betterLongTableBits)
+			off := index0 + e.cur
+			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+			e.markLongShardDirty(h0)
+			h1 := hash5(cv1, betterShortTableBits)
+			e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
+			e.markShortShardDirty(h1)
+			index0 += 2
+		}
+
+		cv = load6432(src, s)
+		if !canRepeat {
+			continue
+		}
+
+		// Check offset 2
+		for {
+			o2 := s - offset2
+			if load3232(src, o2) != uint32(cv) {
+				// Do regular search
+				break
+			}
+
+			// Store this, since we have it.
+			nextHashS := hash5(cv, betterShortTableBits)
+			nextHashL := hash8(cv, betterLongTableBits)
+
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			l := 4 + e.matchlen(s+4, o2+4, src)
+
+			e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset}
+			e.markLongShardDirty(nextHashL)
+			e.table[nextHashS] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.markShortShardDirty(nextHashS)
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				// Finished
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+	}
+
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	if debug {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+}
+
 // ResetDict will reset and set a dictionary if not nil
 func (e *betterFastEncoder) Reset(d *dict, singleBlock bool) {
+	e.resetBase(d, singleBlock)
+	// This variant carries no dictionary tables; betterFastEncoderDict.Reset is
+	// the one that accepts a dictionary, so a non-nil d here is a caller error.
+	if d != nil {
+		panic("betterFastEncoder: Reset with dict")
+	}
+}
+
+// Reset will reset the encoder and set a dictionary if d is not nil
+func (e *betterFastEncoderDict) Reset(d *dict, singleBlock bool) {
 	e.resetBase(d, singleBlock)
 	if d == nil {
 		return
@ -557,6 +1070,7 @@ func (e *betterFastEncoder) Reset(d *dict, singleBlock bool) {
 			}
 		}
 		e.lastDictID = d.id
+		e.allDirty = true
 	}

 	// Init or copy dict table
@ -585,11 +1099,72 @@ func (e *betterFastEncoder) Reset(d *dict, singleBlock bool) {
 			}
 		}
 		e.lastDictID = d.id
+		e.allDirty = true
 	}
-	// Reset table to initial state
-	copy(e.longTable[:], e.dictLongTable)

-	e.cur = e.maxMatchOff
 	// Reset table to initial state
+	{
+		dirtyShardCnt := 0
+		if !e.allDirty {
+			for i := range e.shortTableShardDirty {
+				if e.shortTableShardDirty[i] {
+					dirtyShardCnt++
+				}
+			}
+		}
+		const shardCnt = betterShortTableShardCnt
+		const shardSize = betterShortTableShardSize
+		if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
 			copy(e.table[:], e.dictTable)
+			for i := range e.shortTableShardDirty {
+				e.shortTableShardDirty[i] = false
+			}
+		} else {
+			for i := range e.shortTableShardDirty {
+				if !e.shortTableShardDirty[i] {
+					continue
+				}
+
+				copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
+				e.shortTableShardDirty[i] = false
+			}
+		}
+	}
+	{
+		// Restore the long table. Count the dirty shards of the long table
+		// itself (not the short table) so the choice between one bulk copy and
+		// per-shard copies reflects how much of this table was modified.
+		dirtyShardCnt := 0
+		if !e.allDirty {
+			for i := range e.longTableShardDirty {
+				if e.longTableShardDirty[i] {
+					dirtyShardCnt++
+				}
+			}
+		}
+		const shardCnt = betterLongTableShardCnt
+		const shardSize = betterLongTableShardSize
+		if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
+			// Most shards were touched: one full copy, then clear every flag.
+			copy(e.longTable[:], e.dictLongTable)
+			for i := range e.longTableShardDirty {
+				e.longTableShardDirty[i] = false
+			}
+		} else {
+			// Few shards were touched: restore only those from the dictionary table.
+			for i := range e.longTableShardDirty {
+				if !e.longTableShardDirty[i] {
+					continue
+				}
+
+				copy(e.longTable[i*shardSize:(i+1)*shardSize], e.dictLongTable[i*shardSize:(i+1)*shardSize])
+				e.longTableShardDirty[i] = false
+			}
+		}
+	}
+	e.cur = e.maxMatchOff
+	e.allDirty = false
+}
+
+// markLongShardDirty flags the long-table shard containing entryNum as
+// modified, so that Reset restores it from the dictionary table.
+func (e *betterFastEncoderDict) markLongShardDirty(entryNum uint32) {
+	shard := entryNum / betterLongTableShardSize
+	e.longTableShardDirty[shard] = true
+}
+
+// markShortShardDirty flags the short-table shard containing entryNum as
+// modified, so that Reset restores it from the dictionary table.
+func (e *betterFastEncoderDict) markShortShardDirty(entryNum uint32) {
+	shard := entryNum / betterShortTableShardSize
+	e.shortTableShardDirty[shard] = true
 }
--- a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
@ -11,6 +11,9 @@ const (
 	dFastLongTableSize = 1 << dFastLongTableBits // Size of the table
 	dFastLongTableMask = dFastLongTableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.

+	dLongTableShardCnt  = 1 << (dFastLongTableBits - dictShardBits) // Number of shards in the table
+	dLongTableShardSize = dFastLongTableSize / dLongTableShardCnt   // Size of an individual shard
+
 	dFastShortTableBits = tableBits                // Bits used in the short match table
 	dFastShortTableSize = 1 << dFastShortTableBits // Size of the table
 	dFastShortTableMask = dFastShortTableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
@ -19,7 +22,13 @@ const (
+// doubleFastEncoder extends fastEncoder with a second, larger hash table
+// (longTable). It has no dictionary state; its Reset panics when handed a
+// dictionary.
 type doubleFastEncoder struct {
 	fastEncoder
 	longTable [dFastLongTableSize]tableEntry
+}
+
+// doubleFastEncoderDict is the dictionary-aware counterpart of
+// doubleFastEncoder. It embeds fastEncoderDict for the short table and adds a
+// long table with its own dictionary snapshot and dirty tracking.
+type doubleFastEncoderDict struct {
+	fastEncoderDict
+	// longTable is the working long-match table that Encode reads and writes.
+	longTable           [dFastLongTableSize]tableEntry
+	// dictLongTable is the long table as built from the dictionary; Reset copies
+	// from it to return longTable to its initial state.
 	dictLongTable       []tableEntry
+	// longTableShardDirty[i] is set once shard i of longTable has been written,
+	// letting Reset restore only the shards that changed.
+	longTableShardDirty [dLongTableShardCnt]bool
 }

 // Encode mimmics functionality in zstd_dfast.c
@ -678,9 +687,379 @@ encodeLoop:
 	}
 }

+// Encode will encode the content, with a dictionary if initialized for it.
+//
+// Matches found in src are appended to blk.sequences and the bytes between
+// them to blk.literals; bytes after the last match are appended as trailing
+// literals and counted in blk.extraLits. The two most recent offsets are read
+// from and written back to blk.recentOffsets.
+//
+// Every write to e.table or e.longTable is paired with a markShardDirty or
+// markLongShardDirty call so that Reset can restore only the touched shards.
+func (e *doubleFastEncoderDict) Encode(blk *blockEnc, src []byte) {
+	const (
+		// Input margin is the number of bytes we read (8)
+		// and the maximum we will read ahead (2)
+		inputMargin            = 8 + 2
+		minNonLiteralBlockSize = 16
+	)
+
+	// Protect against e.cur wraparound.
+	// Both branches rewrite the tables wholesale, hence markAllShardsDirty.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.longTable[:] {
+				e.longTable[i] = tableEntry{}
+			}
+			e.markAllShardsDirty()
+			e.cur = e.maxMatchOff
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.longTable[:] {
+			v := e.longTable[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.longTable[i].offset = v
+		}
+		e.markAllShardsDirty()
+		e.cur = e.maxMatchOff
+		break
+	}
+
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		// Too short to search for matches: emit the whole input as literals.
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+
+	// Override src
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 1.
+	const stepSize = 1
+
+	const kSearchStrength = 8
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+
+	// addLiterals appends src[nextEmit:until] to the block literals and
+	// records the literal length on s; it is a no-op when there is nothing pending.
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	if debug {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		var t int32
+		// We allow the encoder to optionally turn off repeat offsets across blocks
+		canRepeat := len(blk.sequences) > 2
+
+		for {
+			if debugAsserts && canRepeat && offset1 == 0 {
+				panic("offset0 was 0")
+			}
+
+			nextHashS := hash5(cv, dFastShortTableBits)
+			nextHashL := hash8(cv, dFastLongTableBits)
+			candidateL := e.longTable[nextHashL]
+			candidateS := e.table[nextHashS]
+
+			const repOff = 1
+			repIndex := s - offset1 + repOff
+			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.longTable[nextHashL] = entry
+			e.markLongShardDirty(nextHashL)
+			e.table[nextHashS] = entry
+			e.markShardDirty(nextHashS)
+
+			if canRepeat {
+				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
+					// Consider history as well.
+					var seq seq
+					length := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
+
+					seq.matchLen = uint32(length - zstdMinMatch)
+
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+
+					// rep 0
+					seq.offset = 1
+					if debugSequences {
+						println("repeat sequence", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+					s += length + repOff
+					nextEmit = s
+					if s >= sLimit {
+						if debug {
+							println("repeat ended", s, length)
+
+						}
+						break encodeLoop
+					}
+					cv = load6432(src, s)
+					continue
+				}
+			}
+			// Find the offsets of our two matches.
+			coffsetL := s - (candidateL.offset - e.cur)
+			coffsetS := s - (candidateS.offset - e.cur)
+
+			// Check if we have a long match.
+			if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
+				// Found a long match, likely at least 8 bytes.
+				// Reference encoder checks all 8 bytes, we only check 4,
+				// but the likelihood of both the first 4 bytes and the hash matching should be enough.
+				t = candidateL.offset - e.cur
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+				break
+			}
+
+			// Check if we have a short match.
+			if coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
+				// found a regular match
+				// See if we can find a long match at s+1
+				const checkAt = 1
+				cv := load6432(src, s+checkAt)
+				nextHashL = hash8(cv, dFastLongTableBits)
+				candidateL = e.longTable[nextHashL]
+				coffsetL = s - (candidateL.offset - e.cur) + checkAt
+
+				// We can store it, since we have at least a 4 byte match.
+				e.longTable[nextHashL] = tableEntry{offset: s + checkAt + e.cur, val: uint32(cv)}
+				e.markLongShardDirty(nextHashL)
+				if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
+					// Found a long match, likely at least 8 bytes.
+					// Reference encoder checks all 8 bytes, we only check 4,
+					// but the likelihood of both the first 4 bytes and the hash matching should be enough.
+					t = candidateL.offset - e.cur
+					s += checkAt
+					if debugMatches {
+						println("long match (after short)")
+					}
+					break
+				}
+
+				t = candidateS.offset - e.cur
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				if debugMatches {
+					println("short match")
+				}
+				break
+			}
+
+			// No match found, move forward in input.
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+
+		// A 4-byte match has been found. Update recent offsets.
+		// We'll later see if more than 4 bytes.
+		offset2 = offset1
+		offset1 = s - t
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+
+		// Extend the 4-byte match as long as possible.
+		l := e.matchlen(s+4, t+4, src) + 4
+
+		// Extend backwards
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+
+		// Index match start+1 (long) and start+2 (short)
+		index0 := s - l + 1
+		// Index match end-2 (long) and end-1 (short)
+		index1 := s - 2
+
+		cv0 := load6432(src, index0)
+		cv1 := load6432(src, index1)
+		te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
+		te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
+		longHash1 := hash8(cv0, dFastLongTableBits)
+		// te1 must be filed under the hash of its own bytes (cv1). Hashing cv0
+		// again would overwrite te0 and store te1 in a slot where a lookup for
+		// cv1 will not find it.
+		longHash2 := hash8(cv1, dFastLongTableBits)
+		e.longTable[longHash1] = te0
+		e.longTable[longHash2] = te1
+		e.markLongShardDirty(longHash1)
+		e.markLongShardDirty(longHash2)
+		cv0 >>= 8
+		cv1 >>= 8
+		te0.offset++
+		te1.offset++
+		te0.val = uint32(cv0)
+		te1.val = uint32(cv1)
+		hashVal1 := hash5(cv0, dFastShortTableBits)
+		hashVal2 := hash5(cv1, dFastShortTableBits)
+		e.table[hashVal1] = te0
+		e.markShardDirty(hashVal1)
+		e.table[hashVal2] = te1
+		e.markShardDirty(hashVal2)
+
+		cv = load6432(src, s)
+
+		if !canRepeat {
+			continue
+		}
+
+		// Check offset 2
+		for {
+			o2 := s - offset2
+			if load3232(src, o2) != uint32(cv) {
+				// Do regular search
+				break
+			}
+
+			// Store this, since we have it.
+			nextHashS := hash5(cv, dFastShortTableBits)
+			nextHashL := hash8(cv, dFastLongTableBits)
+
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			l := 4 + e.matchlen(s+4, o2+4, src)
+
+			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.longTable[nextHashL] = entry
+			e.markLongShardDirty(nextHashL)
+			e.table[nextHashS] = entry
+			e.markShardDirty(nextHashS)
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				// Finished
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+	}
+
+	// Whatever follows the last sequence is emitted as trailing literals.
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	if debug {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+	// If we encoded more than 64K mark all dirty.
+	// Note that src was reassigned to e.hist above, so this measures the history length.
+	if len(src) > 64<<10 {
+		e.markAllShardsDirty()
+	}
+}
+
 // ResetDict will reset and set a dictionary if not nil
 func (e *doubleFastEncoder) Reset(d *dict, singleBlock bool) {
+	// Reject dictionaries before delegating. fastEncoder.Reset panics on a
+	// non-nil d with its own message, which would make a check placed after
+	// the call unreachable and hide which encoder was misused.
+	if d != nil {
+		panic("doubleFastEncoder: Reset with dict not supported")
+	}
 	e.fastEncoder.Reset(d, singleBlock)
+}
+
+// Reset will reset the encoder and set a dictionary if d is not nil
+func (e *doubleFastEncoderDict) Reset(d *dict, singleBlock bool) {
+	allDirty := e.allDirty
+	e.fastEncoderDict.Reset(d, singleBlock)
 	if d == nil {
 		return
 	}
@ -706,8 +1085,37 @@ func (e *doubleFastEncoder) Reset(d *dict, singleBlock bool) {
 			}
 		}
 		e.lastDictID = d.id
+		// The dictionary long table was just rebuilt, so force the full copy
+		// below. The decision is made on the local allDirty captured at entry;
+		// setting only e.allDirty here would leave longTable without the new
+		// dictionary entries until the next Reset.
+		allDirty = true
 	}
 	// Reset table to initial state
 	e.cur = e.maxMatchOff
+
+	dirtyShardCnt := 0
+	if !allDirty {
+		for i := range e.longTableShardDirty {
+			if e.longTableShardDirty[i] {
+				dirtyShardCnt++
+			}
+		}
+	}
+
+	if allDirty || dirtyShardCnt > dLongTableShardCnt/2 {
 		copy(e.longTable[:], e.dictLongTable)
+		for i := range e.longTableShardDirty {
+			e.longTableShardDirty[i] = false
+		}
+		return
+	}
+	for i := range e.longTableShardDirty {
+		if !e.longTableShardDirty[i] {
+			continue
+		}
+
+		copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize])
+		e.longTableShardDirty[i] = false
+	}
+}
+
+// markLongShardDirty flags the long-table shard containing entryNum as
+// modified, so that Reset restores it from dictLongTable.
+func (e *doubleFastEncoderDict) markLongShardDirty(entryNum uint32) {
+	shard := entryNum / dLongTableShardSize
+	e.longTableShardDirty[shard] = true
 }
--- a/vendor/github.com/klauspost/compress/zstd/enc_fast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
@ -13,6 +13,8 @@ import (
+// Dimensions of the hash table embedded in fastEncoder, and of the
+// dirty-tracking shards that fastEncoderDict layers on top of it.
 const (
 	tableBits      = 15                               // Bits used in the table
 	tableSize      = 1 << tableBits                   // Size of the table
+	tableShardCnt  = 1 << (tableBits - dictShardBits) // Number of dirty-tracking shards the table is divided into
+	tableShardSize = tableSize / tableShardCnt        // Number of table entries covered by one shard
 	tableMask      = tableSize - 1                    // Mask for table indices. Redundant, but can eliminate bounds checks.
+	// Upper bound the encoders apply when extending a match.
 	maxMatchLength = 131074
 )
@ -25,7 +27,13 @@ type tableEntry struct {
+// fastEncoder is the single-table encoder without dictionary state; its Reset
+// panics when handed a dictionary.
 type fastEncoder struct {
 	fastBase
 	table [tableSize]tableEntry
+}
+
+// fastEncoderDict is the dictionary-aware counterpart of fastEncoder. It keeps
+// a copy of the table as built from the dictionary and tracks which parts of
+// the working table were modified, so Reset can restore them.
+type fastEncoderDict struct {
+	fastEncoder
+	// dictTable is the table as built from the dictionary; Reset copies from it.
 	dictTable       []tableEntry
+	// tableShardDirty[i] is set once shard i of the working table has been written.
+	tableShardDirty [tableShardCnt]bool
+	// allDirty means the whole table must be restored, regardless of the per-shard flags.
+	allDirty        bool
 }

 // Encode mimmics functionality in zstd_fast.c
@ -617,8 +625,322 @@ encodeLoop:
 	}
 }

+// Encode will encode the content, with a dictionary if initialized for it.
+//
+// Inputs larger than 32 KiB, and every input once the table is flagged fully
+// dirty, are delegated to fastEncoder.Encode and leave the table flagged fully
+// dirty. Otherwise each table write is paired with a markShardDirty call so
+// that Reset can restore only the touched shards.
+//
+// Matches found in src are appended to blk.sequences and the bytes between
+// them to blk.literals; bytes after the last match are appended as trailing
+// literals and counted in blk.extraLits. The two most recent offsets are read
+// from and written back to blk.recentOffsets.
+func (e *fastEncoderDict) Encode(blk *blockEnc, src []byte) {
+	const (
+		inputMargin            = 8
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+	if e.allDirty || len(src) > 32<<10 {
+		e.fastEncoder.Encode(blk, src)
+		e.allDirty = true
+		return
+	}
+	// Protect against e.cur wraparound.
+	// Both branches rewrite the whole table, so it is flagged fully dirty to
+	// make the next Reset restore all of it from the dictionary table.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			e.markAllShardsDirty()
+			e.cur = e.maxMatchOff
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.table[i].offset = v
+		}
+		e.markAllShardsDirty()
+		e.cur = e.maxMatchOff
+		break
+	}
+
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		// Too short to search for matches: emit the whole input as literals.
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+
+	// Override src
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 2.
+	const stepSize = 2
+
+	// TEMPLATE
+	const hashLog = tableBits
+	// seems global, but would be nice to tweak.
+	const kSearchStrength = 7
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+
+	// addLiterals appends src[nextEmit:until] to the block literals and
+	// records the literal length on s; it is a no-op when there is nothing pending.
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	if debug {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		// t will contain the match offset when we find one.
+		// When existing the search loop, we have already checked 4 bytes.
+		var t int32
+
+		// We will not use repeat offsets across blocks.
+		// By not using them for the first 3 matches
+		canRepeat := len(blk.sequences) > 2
+
+		for {
+			if debugAsserts && canRepeat && offset1 == 0 {
+				panic("offset0 was 0")
+			}
+
+			nextHash := hash6(cv, hashLog)
+			nextHash2 := hash6(cv>>8, hashLog)
+			candidate := e.table[nextHash]
+			candidate2 := e.table[nextHash2]
+			repIndex := s - offset1 + 2
+
+			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.markShardDirty(nextHash)
+			e.table[nextHash2] = tableEntry{offset: s + e.cur + 1, val: uint32(cv >> 8)}
+			e.markShardDirty(nextHash2)
+
+			if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) {
+				// Consider history as well.
+				var seq seq
+				var length int32
+				// length = 4 + e.matchlen(s+6, repIndex+4, src)
+				{
+					a := src[s+6:]
+					b := src[repIndex+4:]
+					endI := len(a) & (math.MaxInt32 - 7)
+					length = int32(endI) + 4
+					for i := 0; i < endI; i += 8 {
+						if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+							length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+							break
+						}
+					}
+				}
+
+				seq.matchLen = uint32(length - zstdMinMatch)
+
+				// We might be able to match backwards.
+				// Extend as long as we can.
+				start := s + 2
+				// We end the search early, so we don't risk 0 literals
+				// and have to do special offset treatment.
+				startLimit := nextEmit + 1
+
+				sMin := s - e.maxMatchOff
+				if sMin < 0 {
+					sMin = 0
+				}
+				for repIndex > sMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch {
+					repIndex--
+					start--
+					seq.matchLen++
+				}
+				addLiterals(&seq, start)
+
+				// rep 0
+				seq.offset = 1
+				if debugSequences {
+					println("repeat sequence", seq, "next s:", s)
+				}
+				blk.sequences = append(blk.sequences, seq)
+				s += length + 2
+				nextEmit = s
+				if s >= sLimit {
+					if debug {
+						println("repeat ended", s, length)
+
+					}
+					break encodeLoop
+				}
+				cv = load6432(src, s)
+				continue
+			}
+			coffset0 := s - (candidate.offset - e.cur)
+			coffset1 := s - (candidate2.offset - e.cur) + 1
+			if coffset0 < e.maxMatchOff && uint32(cv) == candidate.val {
+				// found a regular match
+				t = candidate.offset - e.cur
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				break
+			}
+
+			if coffset1 < e.maxMatchOff && uint32(cv>>8) == candidate2.val {
+				// found a regular match
+				t = candidate2.offset - e.cur
+				s++
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				break
+			}
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+		// A 4-byte match has been found. We'll later see if more than 4 bytes.
+		offset2 = offset1
+		offset1 = s - t
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+
+		// Extend the 4-byte match as long as possible.
+		//l := e.matchlen(s+4, t+4, src) + 4
+		var l int32
+		{
+			a := src[s+4:]
+			b := src[t+4:]
+			endI := len(a) & (math.MaxInt32 - 7)
+			l = int32(endI) + 4
+			for i := 0; i < endI; i += 8 {
+				if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+					l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+					break
+				}
+			}
+		}
+
+		// Extend backwards
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence.
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		// Don't use repeat offsets
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+		cv = load6432(src, s)
+
+		// Check offset 2
+		if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) {
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			//l := 4 + e.matchlen(s+4, o2+4, src)
+			var l int32
+			{
+				a := src[s+4:]
+				b := src[o2+4:]
+				endI := len(a) & (math.MaxInt32 - 7)
+				l = int32(endI) + 4
+				for i := 0; i < endI; i += 8 {
+					if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+						l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+						break
+					}
+				}
+			}
+
+			// Store this, since we have it.
+			nextHash := hash6(cv, hashLog)
+			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.markShardDirty(nextHash)
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				break encodeLoop
+			}
+			// Prepare next loop.
+			cv = load6432(src, s)
+		}
+	}
+
+	// Whatever follows the last sequence is emitted as trailing literals.
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	if debug {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+}
+
 // ResetDict will reset and set a dictionary if not nil
 func (e *fastEncoder) Reset(d *dict, singleBlock bool) {
+	e.resetBase(d, singleBlock)
+	// This variant carries no dictionary table; fastEncoderDict.Reset is the
+	// one that accepts a dictionary, so a non-nil d here is a caller error.
+	if d != nil {
+		panic("fastEncoder: Reset with dict")
+	}
+}
+
+// Reset will reset the encoder and set a dictionary if d is not nil
+func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) {
 	e.resetBase(d, singleBlock)
 	if d == nil {
 		return
@ -653,9 +975,44 @@ func (e *fastEncoder) Reset(d *dict, singleBlock bool) {
 			}
 		}
 		e.lastDictID = d.id
+		e.allDirty = true
 	}

 	e.cur = e.maxMatchOff
-	// Reset table to initial state
+	dirtyShardCnt := 0
+	if !e.allDirty {
+		for i := range e.tableShardDirty {
+			if e.tableShardDirty[i] {
+				dirtyShardCnt++
+			}
+		}
+	}
+
+	const shardCnt = tableShardCnt
+	const shardSize = tableShardSize
+	if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
 		copy(e.table[:], e.dictTable)
+		for i := range e.tableShardDirty {
+			e.tableShardDirty[i] = false
+		}
+		e.allDirty = false
+		return
+	}
+	for i := range e.tableShardDirty {
+		if !e.tableShardDirty[i] {
+			continue
+		}
+
+		copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
+		e.tableShardDirty[i] = false
+	}
+	e.allDirty = false
+}
+
+// markAllShardsDirty records that the whole table must be treated as modified.
+// Reset then restores it with one full copy from dictTable, and Encode
+// delegates to fastEncoder.Encode while the flag is set.
+func (e *fastEncoderDict) markAllShardsDirty() {
+	e.allDirty = true
+}
+
+// markShardDirty flags the table shard containing entryNum as modified, so
+// that Reset restores it from dictTable.
+func (e *fastEncoderDict) markShardDirty(entryNum uint32) {
+	shard := entryNum / tableShardSize
+	e.tableShardDirty[shard] = true
 }
--- a/vendor/github.com/klauspost/compress/zstd/encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder.go
@ -106,7 +106,7 @@ func (e *Encoder) Reset(w io.Writer) {
 		s.encoder = e.o.encoder()
 	}
 	if s.writing == nil {
-		s.writing = &blockEnc{}
+		s.writing = &blockEnc{lowMem: e.o.lowMem}
 		s.writing.init()
 	}
 	s.writing.initNewEncode()
@ -176,6 +176,12 @@ func (e *Encoder) nextBlock(final bool) error {
 	}
 	if !s.headerWritten {
 		// If we have a single block encode, do a sync compression.
+		if final && len(s.filling) == 0 && !e.o.fullZero {
+			s.headerWritten = true
+			s.fullFrameWritten = true
+			s.eofWritten = true
+			return nil
+		}
 		if final && len(s.filling) > 0 {
 			s.current = e.EncodeAll(s.filling, s.current[:0])
 			var n2 int
@ -471,7 +477,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 	}

 	// If less than 1MB, allocate a buffer up front.
-	if len(dst) == 0 && cap(dst) == 0 && len(src) < 1<<20 {
+	if len(dst) == 0 && cap(dst) == 0 && len(src) < 1<<20 && !e.o.lowMem {
 		dst = make([]byte, 0, len(src))
 	}
 	dst, err := fh.appendTo(dst)
--- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
@ -24,12 +24,12 @@ type encoderOptions struct {
 	allLitEntropy   bool
 	customWindow    bool
 	customALEntropy bool
+	lowMem          bool
 	dict            *dict
 }

 func (o *encoderOptions) setDefault() {
 	*o = encoderOptions{
-		// use less ram: true for now, but may change.
 		concurrent:    runtime.GOMAXPROCS(0),
 		crc:           true,
 		single:        nil,
@ -37,20 +37,31 @@ func (o *encoderOptions) setDefault() {
 		windowSize:    8 << 20,
 		level:         SpeedDefault,
 		allLitEntropy: true,
+		lowMem:        false,
 	}
 }

 // encoder returns an encoder with the selected options.
 func (o encoderOptions) encoder() encoder {
 	switch o.level {
-	case SpeedDefault:
-		return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}}}
-	case SpeedBetterCompression:
-		return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}}
-	case SpeedBestCompression:
-		return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}}
 	case SpeedFastest:
-		return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}}
+		if o.dict != nil {
+			return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
+		}
+		return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
+
+	case SpeedDefault:
+		if o.dict != nil {
+			return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}}
+		}
+		return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
+	case SpeedBetterCompression:
+		if o.dict != nil {
+			return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
+		}
+		return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
+	case SpeedBestCompression:
+		return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
 	}
 	panic("unknown compression level")
 }
@ -276,6 +287,17 @@ func WithSingleSegment(b bool) EOption {
 	}
 }

+// WithLowerEncoderMem will, in some cases, trade lower memory usage for
+// slower encoding speed.
+// This will not change the window size, which is the primary means of reducing
+// memory usage. See WithWindowSize.
+func WithLowerEncoderMem(b bool) EOption {
+	return func(o *encoderOptions) error {
+		o.lowMem = b
+		return nil
+	}
+}
+
 // WithEncoderDict allows to register a dictionary that will be used for the encode.
 // The encoder *may* choose to use no dictionary instead for certain payloads.
 func WithEncoderDict(dict []byte) EOption {
--- a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
@ -97,7 +97,7 @@ func (s *fseEncoder) prepare() (*fseEncoder, error) {
 func (s *fseEncoder) allocCtable() {
 	tableSize := 1 << s.actualTableLog
 	// get tableSymbol that is big enough.
-	if cap(s.ct.tableSymbol) < int(tableSize) {
+	if cap(s.ct.tableSymbol) < tableSize {
 		s.ct.tableSymbol = make([]byte, tableSize)
 	}
 	s.ct.tableSymbol = s.ct.tableSymbol[:tableSize]
@ -202,13 +202,13 @@ func (s *fseEncoder) buildCTable() error {
 			case 0:
 			case -1, 1:
 				symbolTT[i].deltaNbBits = tl
-				symbolTT[i].deltaFindState = int16(total - 1)
+				symbolTT[i].deltaFindState = total - 1
 				total++
 			default:
 				maxBitsOut := uint32(tableLog) - highBit(uint32(v-1))
 				minStatePlus := uint32(v) << maxBitsOut
 				symbolTT[i].deltaNbBits = (maxBitsOut << 16) - minStatePlus
-				symbolTT[i].deltaFindState = int16(total - v)
+				symbolTT[i].deltaFindState = total - v
 				total += v
 			}
 		}
@ -353,8 +353,8 @@ func (s *fseEncoder) normalizeCount2(length int) error {
 		distributed  uint32
 		total        = uint32(length)
 		tableLog     = s.actualTableLog
-		lowThreshold = uint32(total >> tableLog)
-		lowOne       = uint32((total * 3) >> (tableLog + 1))
+		lowThreshold = total >> tableLog
+		lowOne       = (total * 3) >> (tableLog + 1)
 	)
 	for i, cnt := range s.count[:s.symbolLen] {
 		if cnt == 0 {
@ -379,7 +379,7 @@ func (s *fseEncoder) normalizeCount2(length int) error {

 	if (total / toDistribute) > lowOne {
 		// risk of rounding to zero
-		lowOne = uint32((total * 3) / (toDistribute * 2))
+		lowOne = (total * 3) / (toDistribute * 2)
 		for i, cnt := range s.count[:s.symbolLen] {
 			if (s.norm[i] == notYetAssigned) && (cnt <= lowOne) {
 				s.norm[i] = 1
--- a/vendor/github.com/klauspost/compress/zstd/snappy.go
+++ b/vendor/github.com/klauspost/compress/zstd/snappy.go
@ -417,7 +417,7 @@ var crcTable = crc32.MakeTable(crc32.Castagnoli)
 // https://github.com/google/snappy/blob/master/framing_format.txt
 func snappyCRC(b []byte) uint32 {
 	c := crc32.Update(0, crcTable, b)
-	return uint32(c>>15|c<<17) + 0xa282ead8
+	return c>>15 | c<<17 + 0xa282ead8
 }

 // snappyDecodedLen returns the length of the decoded block and the number of bytes
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@ -127,7 +127,7 @@ github.com/jmespath/go-jmespath
 github.com/jstemmer/go-junit-report
 github.com/jstemmer/go-junit-report/formatter
 github.com/jstemmer/go-junit-report/parser
-# github.com/klauspost/compress v1.11.12
+# github.com/klauspost/compress v1.11.13
 ## explicit
 github.com/klauspost/compress/flate
 github.com/klauspost/compress/fse