vendor: update github.com/klauspost/compress from v1.16.0 to v1.16.3

Aliaksandr Valialkin 2023-03-14 16:14:25 -07:00
parent 8f6d5217d1
commit 90e1818068
No known key found for this signature in database
GPG key ID: A72BEC6CD3D0DED1
20 changed files with 1540 additions and 69 deletions

2
go.mod
View file

@ -23,7 +23,7 @@ require (
github.com/golang/snappy v0.0.4
github.com/googleapis/gax-go/v2 v2.7.1
github.com/influxdata/influxdb v1.11.0
github.com/klauspost/compress v1.16.0
github.com/klauspost/compress v1.16.3
github.com/prometheus/prometheus v0.42.0
github.com/urfave/cli/v2 v2.25.0
github.com/valyala/fastjson v1.6.4

4
go.sum
View file

@ -313,8 +313,8 @@ github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/compress v1.13.4/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg=
github.com/klauspost/compress v1.13.5/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
github.com/klauspost/compress v1.16.0 h1:iULayQNOReoYUe+1qtKOqw9CwJv3aNQu8ivo7lw1HU4=
github.com/klauspost/compress v1.16.0/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
github.com/klauspost/compress v1.16.3 h1:XuJt9zzcnaz6a16/OU53ZjWp/v7/42WcR5t2a0PcNQY=
github.com/klauspost/compress v1.16.3/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
github.com/kolo/xmlrpc v0.0.0-20220921171641-a4b6fa1dd06b h1:udzkj9S/zlT5X367kqJis0QP7YMxobob6zhzq6Yre00=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=

View file

@ -16,6 +16,21 @@ This package provides various compression algorithms.
# changelog
* Mar 13, 2023 - [v1.16.1](https://github.com/klauspost/compress/releases/tag/v1.16.1)
* zstd: Speed up + improve best encoder by @greatroar in https://github.com/klauspost/compress/pull/776
* gzhttp: Add optional [BREACH mitigation](https://github.com/klauspost/compress/tree/master/gzhttp#breach-mitigation). https://github.com/klauspost/compress/pull/762 https://github.com/klauspost/compress/pull/768 https://github.com/klauspost/compress/pull/769 https://github.com/klauspost/compress/pull/770 https://github.com/klauspost/compress/pull/767
* s2: Add Intel LZ4s converter https://github.com/klauspost/compress/pull/766
* zstd: Minor bug fixes https://github.com/klauspost/compress/pull/771 https://github.com/klauspost/compress/pull/772 https://github.com/klauspost/compress/pull/773
* huff0: Speed up compress1xDo by @greatroar in https://github.com/klauspost/compress/pull/774
* Feb 26, 2023 - [v1.16.0](https://github.com/klauspost/compress/releases/tag/v1.16.0)
* s2: Add [Dictionary](https://github.com/klauspost/compress/tree/master/s2#dictionaries) support. https://github.com/klauspost/compress/pull/685
* s2: Add Compression Size Estimate. https://github.com/klauspost/compress/pull/752
* s2: Add support for custom stream encoder. https://github.com/klauspost/compress/pull/755
* s2: Add LZ4 block converter. https://github.com/klauspost/compress/pull/748
* s2: Support io.ReaderAt in ReadSeeker. https://github.com/klauspost/compress/pull/747
* s2c/s2sx: Use concurrent decoding. https://github.com/klauspost/compress/pull/746
* Jan 21st, 2023 (v1.15.15)
* deflate: Improve level 7-9 by @klauspost in https://github.com/klauspost/compress/pull/739
* zstd: Add delta encoding support by @greatroar in https://github.com/klauspost/compress/pull/728

View file

@ -260,7 +260,9 @@ func (s *Scratch) buildDtable() error {
// If the buffer is over-read an error is returned.
func (s *Scratch) decompress() error {
br := &s.bits
br.init(s.br.unread())
if err := br.init(s.br.unread()); err != nil {
return err
}
var s1, s2 decoder
// Initialize and decode first state and symbol.

View file

@ -215,6 +215,67 @@ has been reached. In this case it will assume that the minimum size has been rea
If nothing has been written to the response writer, nothing will be flushed.
## BREACH mitigation
[BREACH](http://css.csail.mit.edu/6.858/2020/readings/breach.pdf) is a specialized attack where attacker controlled data
is injected alongside secret data in a response body. This can lead to side-channel attacks, where observing the compressed response
size can reveal if there are overlaps between the secret data and the injected data.
For more information see https://breachattack.com/
It can be hard to judge if you are vulnerable to BREACH.
In general, if you do not include any user provided content in the response body you are safe,
but if you do, or you are in doubt, you can apply mitigations.
`gzhttp` can apply [Heal the Breach](https://ieeexplore.ieee.org/document/9754554), or improved content aware padding.
```Go
// RandomJitter adds 1->n random bytes to output based on checksum of payload.
// Specify the amount of input to buffer before applying jitter.
// This should cover the sensitive part of your response.
// This can be used to obfuscate the exact compressed size.
// Specifying 0 will use a buffer size of 64KB.
// 'paranoid' will use a slower hashing function that MAY provide more safety.
// If a negative buffer is given, the amount of jitter will not be content dependent.
// This provides *less* security than applying content based jitter.
func RandomJitter(n, buffer int, paranoid bool) option
...
```
The jitter is added as a "Comment" field. This field has a 1 byte overhead, so actual extra size will be 2 -> n+1 (inclusive).
A good option would be to apply 32 random bytes, with default 64KB buffer: `gzhttp.RandomJitter(32, 0, false)`.
Note that flushing the data forces the padding to be applied, which means that only data before the flush is considered for content aware padding.
The *padding* in the comment is the text `Padding-Padding-Padding-Padding-Pad....`
The *length* is `1 + crc32c(payload) MOD n` or `1 + sha256(payload) MOD n` (paranoid), or just random from `crypto/rand` if buffer < 0.
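For the non-paranoid case, a minimal sketch of how the padding length falls out of the checksum, assuming `payload` is the buffered part of the response and `n` is the first argument to `RandomJitter`; the rotate/xor mixing mirrors the implementation shown later in this diff, but should be treated as an internal detail:
```Go
package main

import (
	"fmt"
	"hash/crc32"
	"math/bits"
)

// paddingLen returns the number of padding bytes (1..n) derived from the
// CRC32-C of the buffered payload, mirroring the non-paranoid jitter mode.
func paddingLen(payload []byte, n uint32) uint32 {
	sum := crc32.Checksum(payload, crc32.MakeTable(crc32.Castagnoli))
	mixed := bits.RotateLeft32(sum, 19) ^ 0xab0755de
	return 1 + mixed%n
}

func main() {
	// The same payload always yields the same padding length; an attacker who
	// cannot predict the secret part of the payload cannot predict the length.
	fmt.Println(paddingLen([]byte("secret=42&echo=attacker-data"), 32))
}
```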
### Paranoid?
The padding size is determined by the remainder of a CRC32 of the content.
Since the payload contains elements unknown to the attacker, there is no reason to believe they can derive any information
from this remainder, or predict it.
However, those who feel uncomfortable with a CRC32 being used for this can enable "paranoid" mode, which will use SHA256 for determining the padding.
The hashing itself is about 2 orders of magnitude slower, but in overall terms it will typically only reduce speed by about 10%.
Paranoid mode has no effect if buffer is < 0 (non-content aware padding).
### Examples
Adding the option `gzhttp.RandomJitter(32, 50000, false)` will apply from 1 up to 32 bytes of random data to the output.
The number of bytes added depends on the content of the first 50000 bytes, or all of them if the output was less than that.
Adding the option `gzhttp.RandomJitter(32, -1, false)` will apply from 1 up to 32 bytes of random data to the output.
Each call will apply a random amount of jitter. This should be considered less secure than content based jitter.
This can be used if responses are very big, deterministic and the buffer size would be too big to cover where the mutation occurs.
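A small end-to-end sketch of wiring the option into a handler via `NewWrapper`; the handler body and port are illustrative:
```Go
package main

import (
	"log"
	"net/http"

	"github.com/klauspost/compress/gzhttp"
)

func main() {
	// Content-aware jitter: up to 32 padding bytes, default 64KB buffer,
	// CRC32-based (non-paranoid) hashing.
	wrap, err := gzhttp.NewWrapper(gzhttp.RandomJitter(32, 0, false))
	if err != nil {
		log.Fatal(err)
	}
	h := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// A response mixing a secret with user-controlled input is the kind
		// of body BREACH targets.
		w.Write([]byte("secret=42&echo=" + r.URL.Query().Get("echo")))
	})
	log.Fatal(http.ListenAndServe(":8080", wrap(h)))
}
```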
## License
[Apache 2.0](LICENSE)

View file

@ -2,8 +2,15 @@ package gzhttp
import (
"bufio"
"crypto/rand"
"crypto/sha256"
"encoding/binary"
"errors"
"fmt"
"hash/crc32"
"io"
"math"
"math/bits"
"mime"
"net"
"net/http"
@ -67,6 +74,9 @@ type GzipResponseWriter struct {
setContentType bool // Add content type, if missing and detected.
suffixETag string // Suffix to add to ETag header if response is compressed.
dropETag bool // Drop ETag header if response is compressed (supersedes suffixETag).
sha256Jitter bool // Use sha256 for jitter.
randomJitter string // Add random bytes to output as header field.
jitterBuffer int // Maximum buffer to accumulate before doing jitter.
contentTypeFilter func(ct string) bool // Only compress if the response is one of these content-types. All are accepted if empty.
}
@ -97,6 +107,9 @@ func (w *GzipResponseWriter) Write(b []byte) (int, error) {
if w.minSize > wantBuf {
wantBuf = w.minSize
}
if w.jitterBuffer > 0 && w.jitterBuffer > wantBuf {
wantBuf = w.jitterBuffer
}
toAdd := len(b)
if len(w.buf)+toAdd > wantBuf {
toAdd = wantBuf - len(w.buf)
@ -112,7 +125,7 @@ func (w *GzipResponseWriter) Write(b []byte) (int, error) {
ct := hdr.Get(contentType)
if cl == 0 || cl >= w.minSize && (ct == "" || w.contentTypeFilter(ct)) {
// If the current buffer is less than minSize and a Content-Length isn't set, then wait until we have more data.
if len(w.buf) < w.minSize && cl == 0 {
if len(w.buf) < w.minSize && cl == 0 || (w.jitterBuffer > 0 && len(w.buf) < w.jitterBuffer) {
return len(b), nil
}
@ -131,7 +144,7 @@ func (w *GzipResponseWriter) Write(b []byte) (int, error) {
// If the Content-Type is acceptable to GZIP, initialize the GZIP writer.
if w.contentTypeFilter(ct) {
if err := w.startGzip(); err != nil {
if err := w.startGzip(remain); err != nil {
return 0, err
}
if len(remain) > 0 {
@ -156,8 +169,10 @@ func (w *GzipResponseWriter) Write(b []byte) (int, error) {
return len(b), nil
}
var castagnoliTable = crc32.MakeTable(crc32.Castagnoli)
// startGzip initializes a GZIP writer and writes the buffer.
func (w *GzipResponseWriter) startGzip() error {
func (w *GzipResponseWriter) startGzip(remain []byte) error {
// Set the GZIP header.
w.Header().Set(contentEncoding, "gzip")
@ -199,6 +214,49 @@ func (w *GzipResponseWriter) startGzip() error {
if len(w.buf) > 0 {
// Initialize the GZIP response.
w.init()
// Set random jitter based on CRC or SHA-256 of current buffer.
// Before first write.
if len(w.randomJitter) > 0 {
var jitRNG uint32
if w.jitterBuffer > 0 {
if w.sha256Jitter {
h := sha256.New()
h.Write(w.buf)
// Use only up to "w.jitterBuffer", otherwise the output depends on write sizes.
if len(remain) > 0 && len(w.buf) < w.jitterBuffer {
remain := remain
if len(remain)+len(w.buf) > w.jitterBuffer {
remain = remain[:w.jitterBuffer-len(w.buf)]
}
h.Write(remain)
}
var tmp [sha256.Size]byte
jitRNG = binary.LittleEndian.Uint32(h.Sum(tmp[:0]))
} else {
h := crc32.Update(0, castagnoliTable, w.buf)
// Use only up to "w.jitterBuffer", otherwise the output depends on write sizes.
if len(remain) > 0 && len(w.buf) < w.jitterBuffer {
remain := remain
if len(remain)+len(w.buf) > w.jitterBuffer {
remain = remain[:w.jitterBuffer-len(w.buf)]
}
h = crc32.Update(h, castagnoliTable, remain)
}
jitRNG = bits.RotateLeft32(h, 19) ^ 0xab0755de
}
} else {
// Get from rand.Reader
var tmp [4]byte
_, err := rand.Read(tmp[:])
if err != nil {
return fmt.Errorf("gzhttp: %w", err)
}
jitRNG = binary.LittleEndian.Uint32(tmp[:])
}
jit := w.randomJitter[:1+jitRNG%uint32(len(w.randomJitter)-1)]
w.gw.(writer.GzipWriterExt).SetHeader(writer.Header{Comment: jit})
}
n, err := w.gw.Write(w.buf)
// This should never happen (per io.Writer docs), but if the write didn't
@ -259,15 +317,21 @@ func (w *GzipResponseWriter) Close() error {
if w.ignore {
return nil
}
if w.gw == nil {
// GZIP not triggered yet, write out regular response.
err := w.startPlain()
// Returns the error if any at write.
if err != nil {
err = fmt.Errorf("gziphandler: write to regular responseWriter at close gets error: %q", err.Error())
var (
ct = w.Header().Get(contentType)
ce = w.Header().Get(contentEncoding)
cr = w.Header().Get(contentRange)
)
// fmt.Println(len(w.buf) == 0, len(w.buf) < w.minSize, len(w.Header()[HeaderNoCompression]) != 0, ce != "", cr != "", !w.contentTypeFilter(ct))
if len(w.buf) == 0 || len(w.buf) < w.minSize || len(w.Header()[HeaderNoCompression]) != 0 || ce != "" || cr != "" || !w.contentTypeFilter(ct) {
// GZIP not triggered, write out regular response.
return w.startPlain()
}
err := w.startGzip(nil)
if err != nil {
return err
}
return err
}
err := w.gw.Close()
@ -310,7 +374,7 @@ func (w *GzipResponseWriter) Flush() {
// See if we should compress...
if len(w.Header()[HeaderNoCompression]) == 0 && ce == "" && cr == "" && cl >= w.minSize && w.contentTypeFilter(ct) {
w.startGzip()
w.startGzip(nil)
} else {
w.startPlain()
}
@ -392,6 +456,9 @@ func NewWrapper(opts ...option) (func(http.Handler) http.HandlerFunc, error) {
suffixETag: c.suffixETag,
buf: gw.buf,
setContentType: c.setContentType,
randomJitter: c.randomJitter,
jitterBuffer: c.jitterBuffer,
sha256Jitter: c.sha256Jitter,
}
if len(gw.buf) > 0 {
gw.buf = gw.buf[:0]
@ -408,6 +475,7 @@ func NewWrapper(opts ...option) (func(http.Handler) http.HandlerFunc, error) {
} else {
h.ServeHTTP(gw, r)
}
w.Header().Del(HeaderNoCompression)
} else {
h.ServeHTTP(newNoGzipResponseWriter(w), r)
w.Header().Del(HeaderNoCompression)
@ -455,6 +523,9 @@ type config struct {
setContentType bool
suffixETag string
dropETag bool
jitterBuffer int
randomJitter string
sha256Jitter bool
}
func (c *config) validate() error {
@ -466,7 +537,16 @@ func (c *config) validate() error {
if c.minSize < 0 {
return fmt.Errorf("minimum size must be more than zero")
}
if len(c.randomJitter) >= math.MaxUint16 {
return fmt.Errorf("random jitter size exceeded")
}
if len(c.randomJitter) > 0 {
gzw, ok := c.writer.New(io.Discard, c.level).(writer.GzipWriterExt)
if !ok {
return errors.New("the custom compressor does not allow setting headers for random jitter")
}
gzw.Close()
}
return nil
}
@ -496,8 +576,9 @@ func SetContentType(b bool) option {
// Implementation changes the implementation of GzipWriter
//
// The default implementation is writer/stdlib/NewWriter
// which is backed by standard library's compress/zlib
// The default implementation is backed by github.com/klauspost/compress
// To support RandomJitter, the GzipWriterExt must also be
// supported by the returned writers.
func Implementation(writer writer.GzipWriterFactory) option {
return func(c *config) {
c.writer = writer
@ -625,6 +706,31 @@ func DropETag() option {
}
}
// RandomJitter adds 1->n random bytes to output based on checksum of payload.
// Specify the amount of input to buffer before applying jitter.
// This should cover the sensitive part of your response.
// This can be used to obfuscate the exact compressed size.
// Specifying 0 will use a buffer size of 64KB.
// 'paranoid' will use a slower hashing function that MAY provide more safety.
// See README.md for more information.
// If a negative buffer is given, the amount of jitter will not be content dependent.
// This provides *less* security than applying content based jitter.
func RandomJitter(n, buffer int, paranoid bool) option {
return func(c *config) {
if n > 0 {
c.sha256Jitter = paranoid
c.randomJitter = strings.Repeat("Padding-", 1+(n/8))[:n+1]
c.jitterBuffer = buffer
if c.jitterBuffer == 0 {
c.jitterBuffer = 64 << 10
}
} else {
c.randomJitter = ""
c.jitterBuffer = 0
}
}
}
// acceptsGzip returns true if the given HTTP request indicates that it will
// accept a gzipped response.
func acceptsGzip(r *http.Request) bool {
@ -702,10 +808,23 @@ func parseEncodings(s string) (codings, error) {
return c, nil
}
var errEmptyEncoding = errors.New("empty content-coding")
// parseCoding parses a single coding (content-coding with an optional qvalue),
// as might appear in an Accept-Encoding header. It attempts to forgive minor
// formatting errors.
func parseCoding(s string) (coding string, qvalue float64, err error) {
// Avoid splitting if we can...
if len(s) == 0 {
return "", 0, errEmptyEncoding
}
if !strings.ContainsRune(s, ';') {
coding = strings.ToLower(strings.TrimSpace(s))
if coding == "" {
err = errEmptyEncoding
}
return coding, DefaultQValue, err
}
for n, part := range strings.Split(s, ";") {
part = strings.TrimSpace(part)
qvalue = DefaultQValue
@ -724,7 +843,7 @@ func parseCoding(s string) (coding string, qvalue float64, err error) {
}
if coding == "" {
err = fmt.Errorf("empty content-coding")
err = errEmptyEncoding
}
return
@ -766,6 +885,9 @@ const intSize = 32 << (^uint(0) >> 63)
// atoi is equivalent to ParseInt(s, 10, 0), converted to type int.
func atoi(s string) (int, bool) {
if len(s) == 0 {
return 0, false
}
sLen := len(s)
if intSize == 32 && (0 < sLen && sLen < 10) ||
intSize == 64 && (0 < sLen && sLen < 19) {

View file

@ -61,6 +61,15 @@ func NewWriter(w io.Writer, level int) writer.GzipWriter {
}
}
// SetHeader will override the gzip header on pw.
func (pw *pooledWriter) SetHeader(h writer.Header) {
pw.Name = h.Name
pw.Extra = h.Extra
pw.Comment = h.Comment
pw.ModTime = h.ModTime
pw.OS = h.OS
}
func Levels() (min, max int) {
return gzip.StatelessCompression, gzip.BestCompression
}

View file

@ -1,6 +1,9 @@
package writer
import "io"
import (
"io"
"time"
)
// GzipWriter implements the functions needed for compressing content.
type GzipWriter interface {
@ -9,6 +12,24 @@ type GzipWriter interface {
Flush() error
}
// GzipWriterExt implements the functions needed for compressing content
// and optional extensions.
type GzipWriterExt interface {
GzipWriter
// SetHeader will populate header fields with non-nil values in h.
SetHeader(h Header)
}
// Header is a gzip header.
type Header struct {
Comment string // comment
Extra []byte // "extra data"
ModTime time.Time // modification time
Name string // file name
OS byte // operating system type
}
// GzipWriterFactory contains the information needed for custom gzip implementations.
type GzipWriterFactory struct {
// Must return the minimum and maximum supported level.

View file

@ -60,6 +60,22 @@ func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) {
b.nBits += encA.nBits + encB.nBits
}
// encFourSymbols adds up to 32 bits from four symbols.
// It will not check if there is space for them,
// so the caller must ensure that b has been flushed recently.
func (b *bitWriter) encFourSymbols(encA, encB, encC, encD cTableEntry) {
bitsA := encA.nBits
bitsB := bitsA + encB.nBits
bitsC := bitsB + encC.nBits
bitsD := bitsC + encD.nBits
combined := uint64(encA.val) |
(uint64(encB.val) << (bitsA & 63)) |
(uint64(encC.val) << (bitsB & 63)) |
(uint64(encD.val) << (bitsC & 63))
b.bitContainer |= combined << (b.nBits & 63)
b.nBits += bitsD
}
// flush32 will flush out, so there are at least 32 bits available for writing.
func (b *bitWriter) flush32() {
if b.nBits < 32 {

View file

@ -248,8 +248,7 @@ func (s *Scratch) compress1xDo(dst, src []byte) ([]byte, error) {
tmp := src[n : n+4]
// tmp should be len 4
bw.flush32()
bw.encTwoSymbols(cTable, tmp[3], tmp[2])
bw.encTwoSymbols(cTable, tmp[1], tmp[0])
bw.encFourSymbols(cTable[tmp[3]], cTable[tmp[2]], cTable[tmp[1]], cTable[tmp[0]])
}
} else {
for ; n >= 0; n -= 4 {

View file

@ -717,3 +717,11 @@ func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) {
func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) {
panic("cvtLZ4BlockSnappyAsm should be unreachable")
}
func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) {
panic("cvtLZ4sBlockAsm should be unreachable")
}
func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) {
panic("cvtLZ4sBlockSnappyAsm should be unreachable")
}

View file

@ -212,7 +212,17 @@ func matchLen(a []byte, b []byte) int
//go:noescape
func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// cvtLZ4Block converts an LZ4 block to S2
// cvtLZ4sBlock converts an LZ4s block to S2
//
//go:noescape
func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// cvtLZ4Block converts an LZ4 block to Snappy
//
//go:noescape
func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// cvtLZ4sBlock converts an LZ4s block to Snappy
//
//go:noescape
func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)

View file

@ -19271,6 +19271,491 @@ lz4_s2_dstfull:
MOVQ SI, uncompressed+48(FP)
RET
// func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: SSE2
TEXT ·cvtLZ4sBlockAsm(SB), NOSPLIT, $0-64
XORQ SI, SI
MOVQ dst_base+0(FP), AX
MOVQ dst_len+8(FP), CX
MOVQ src_base+24(FP), DX
MOVQ src_len+32(FP), BX
LEAQ (DX)(BX*1), BX
LEAQ -10(AX)(CX*1), CX
XORQ DI, DI
lz4s_s2_loop:
CMPQ DX, BX
JAE lz4s_s2_corrupt
CMPQ AX, CX
JAE lz4s_s2_dstfull
MOVBQZX (DX), R8
MOVQ R8, R9
MOVQ R8, R10
SHRQ $0x04, R9
ANDQ $0x0f, R10
CMPQ R8, $0xf0
JB lz4s_s2_ll_end
lz4s_s2_ll_loop:
INCQ DX
CMPQ DX, BX
JAE lz4s_s2_corrupt
MOVBQZX (DX), R8
ADDQ R8, R9
CMPQ R8, $0xff
JEQ lz4s_s2_ll_loop
lz4s_s2_ll_end:
LEAQ (DX)(R9*1), R8
ADDQ $0x03, R10
CMPQ R8, BX
JAE lz4s_s2_corrupt
INCQ DX
INCQ R8
TESTQ R9, R9
JZ lz4s_s2_lits_done
LEAQ (AX)(R9*1), R11
CMPQ R11, CX
JAE lz4s_s2_dstfull
ADDQ R9, SI
LEAL -1(R9), R11
CMPL R11, $0x3c
JLT one_byte_lz4s_s2
CMPL R11, $0x00000100
JLT two_bytes_lz4s_s2
CMPL R11, $0x00010000
JLT three_bytes_lz4s_s2
CMPL R11, $0x01000000
JLT four_bytes_lz4s_s2
MOVB $0xfc, (AX)
MOVL R11, 1(AX)
ADDQ $0x05, AX
JMP memmove_long_lz4s_s2
four_bytes_lz4s_s2:
MOVL R11, R12
SHRL $0x10, R12
MOVB $0xf8, (AX)
MOVW R11, 1(AX)
MOVB R12, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_lz4s_s2
three_bytes_lz4s_s2:
MOVB $0xf4, (AX)
MOVW R11, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_lz4s_s2
two_bytes_lz4s_s2:
MOVB $0xf0, (AX)
MOVB R11, 1(AX)
ADDQ $0x02, AX
CMPL R11, $0x40
JL memmove_lz4s_s2
JMP memmove_long_lz4s_s2
one_byte_lz4s_s2:
SHLB $0x02, R11
MOVB R11, (AX)
ADDQ $0x01, AX
memmove_lz4s_s2:
LEAQ (AX)(R9*1), R11
// genMemMoveShort
CMPQ R9, $0x08
JLE emit_lit_memmove_lz4s_s2_memmove_move_8
CMPQ R9, $0x10
JBE emit_lit_memmove_lz4s_s2_memmove_move_8through16
CMPQ R9, $0x20
JBE emit_lit_memmove_lz4s_s2_memmove_move_17through32
JMP emit_lit_memmove_lz4s_s2_memmove_move_33through64
emit_lit_memmove_lz4s_s2_memmove_move_8:
MOVQ (DX), R12
MOVQ R12, (AX)
JMP memmove_end_copy_lz4s_s2
emit_lit_memmove_lz4s_s2_memmove_move_8through16:
MOVQ (DX), R12
MOVQ -8(DX)(R9*1), DX
MOVQ R12, (AX)
MOVQ DX, -8(AX)(R9*1)
JMP memmove_end_copy_lz4s_s2
emit_lit_memmove_lz4s_s2_memmove_move_17through32:
MOVOU (DX), X0
MOVOU -16(DX)(R9*1), X1
MOVOU X0, (AX)
MOVOU X1, -16(AX)(R9*1)
JMP memmove_end_copy_lz4s_s2
emit_lit_memmove_lz4s_s2_memmove_move_33through64:
MOVOU (DX), X0
MOVOU 16(DX), X1
MOVOU -32(DX)(R9*1), X2
MOVOU -16(DX)(R9*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
MOVOU X2, -32(AX)(R9*1)
MOVOU X3, -16(AX)(R9*1)
memmove_end_copy_lz4s_s2:
MOVQ R11, AX
JMP lz4s_s2_lits_emit_done
memmove_long_lz4s_s2:
LEAQ (AX)(R9*1), R11
// genMemMoveLong
MOVOU (DX), X0
MOVOU 16(DX), X1
MOVOU -32(DX)(R9*1), X2
MOVOU -16(DX)(R9*1), X3
MOVQ R9, R13
SHRQ $0x05, R13
MOVQ AX, R12
ANDL $0x0000001f, R12
MOVQ $0x00000040, R14
SUBQ R12, R14
DECQ R13
JA emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32
LEAQ -32(DX)(R14*1), R12
LEAQ -32(AX)(R14*1), R15
emit_lit_memmove_long_lz4s_s2large_big_loop_back:
MOVOU (R12), X4
MOVOU 16(R12), X5
MOVOA X4, (R15)
MOVOA X5, 16(R15)
ADDQ $0x20, R15
ADDQ $0x20, R12
ADDQ $0x20, R14
DECQ R13
JNA emit_lit_memmove_long_lz4s_s2large_big_loop_back
emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32:
MOVOU -32(DX)(R14*1), X4
MOVOU -16(DX)(R14*1), X5
MOVOA X4, -32(AX)(R14*1)
MOVOA X5, -16(AX)(R14*1)
ADDQ $0x20, R14
CMPQ R9, R14
JAE emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
MOVOU X2, -32(AX)(R9*1)
MOVOU X3, -16(AX)(R9*1)
MOVQ R11, AX
lz4s_s2_lits_emit_done:
MOVQ R8, DX
lz4s_s2_lits_done:
CMPQ DX, BX
JNE lz4s_s2_match
CMPQ R10, $0x03
JEQ lz4s_s2_done
JMP lz4s_s2_corrupt
lz4s_s2_match:
CMPQ R10, $0x03
JEQ lz4s_s2_loop
LEAQ 2(DX), R8
CMPQ R8, BX
JAE lz4s_s2_corrupt
MOVWQZX (DX), R9
MOVQ R8, DX
TESTQ R9, R9
JZ lz4s_s2_corrupt
CMPQ R9, SI
JA lz4s_s2_corrupt
CMPQ R10, $0x12
JNE lz4s_s2_ml_done
lz4s_s2_ml_loop:
MOVBQZX (DX), R8
INCQ DX
ADDQ R8, R10
CMPQ DX, BX
JAE lz4s_s2_corrupt
CMPQ R8, $0xff
JEQ lz4s_s2_ml_loop
lz4s_s2_ml_done:
ADDQ R10, SI
CMPQ R9, DI
JNE lz4s_s2_docopy
// emitRepeat
emit_repeat_again_lz4_s2:
MOVL R10, R8
LEAL -4(R10), R10
CMPL R8, $0x08
JLE repeat_two_lz4_s2
CMPL R8, $0x0c
JGE cant_repeat_two_offset_lz4_s2
CMPL R9, $0x00000800
JLT repeat_two_offset_lz4_s2
cant_repeat_two_offset_lz4_s2:
CMPL R10, $0x00000104
JLT repeat_three_lz4_s2
CMPL R10, $0x00010100
JLT repeat_four_lz4_s2
CMPL R10, $0x0100ffff
JLT repeat_five_lz4_s2
LEAL -16842747(R10), R10
MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_lz4_s2
repeat_five_lz4_s2:
LEAL -65536(R10), R10
MOVL R10, R9
MOVW $0x001d, (AX)
MOVW R10, 2(AX)
SARL $0x10, R9
MOVB R9, 4(AX)
ADDQ $0x05, AX
JMP lz4s_s2_loop
repeat_four_lz4_s2:
LEAL -256(R10), R10
MOVW $0x0019, (AX)
MOVW R10, 2(AX)
ADDQ $0x04, AX
JMP lz4s_s2_loop
repeat_three_lz4_s2:
LEAL -4(R10), R10
MOVW $0x0015, (AX)
MOVB R10, 2(AX)
ADDQ $0x03, AX
JMP lz4s_s2_loop
repeat_two_lz4_s2:
SHLL $0x02, R10
ORL $0x01, R10
MOVW R10, (AX)
ADDQ $0x02, AX
JMP lz4s_s2_loop
repeat_two_offset_lz4_s2:
XORQ R8, R8
LEAL 1(R8)(R10*4), R10
MOVB R9, 1(AX)
SARL $0x08, R9
SHLL $0x05, R9
ORL R9, R10
MOVB R10, (AX)
ADDQ $0x02, AX
JMP lz4s_s2_loop
lz4s_s2_docopy:
MOVQ R9, DI
// emitCopy
CMPL R10, $0x40
JLE two_byte_offset_short_lz4_s2
CMPL R9, $0x00000800
JAE long_offset_short_lz4_s2
MOVL $0x00000001, R8
LEAL 16(R8), R8
MOVB R9, 1(AX)
MOVL R9, R11
SHRL $0x08, R11
SHLL $0x05, R11
ORL R11, R8
MOVB R8, (AX)
ADDQ $0x02, AX
SUBL $0x08, R10
// emitRepeat
LEAL -4(R10), R10
JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
emit_repeat_again_lz4_s2_emit_copy_short_2b:
MOVL R10, R8
LEAL -4(R10), R10
CMPL R8, $0x08
JLE repeat_two_lz4_s2_emit_copy_short_2b
CMPL R8, $0x0c
JGE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
CMPL R9, $0x00000800
JLT repeat_two_offset_lz4_s2_emit_copy_short_2b
cant_repeat_two_offset_lz4_s2_emit_copy_short_2b:
CMPL R10, $0x00000104
JLT repeat_three_lz4_s2_emit_copy_short_2b
CMPL R10, $0x00010100
JLT repeat_four_lz4_s2_emit_copy_short_2b
CMPL R10, $0x0100ffff
JLT repeat_five_lz4_s2_emit_copy_short_2b
LEAL -16842747(R10), R10
MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_lz4_s2_emit_copy_short_2b
repeat_five_lz4_s2_emit_copy_short_2b:
LEAL -65536(R10), R10
MOVL R10, R9
MOVW $0x001d, (AX)
MOVW R10, 2(AX)
SARL $0x10, R9
MOVB R9, 4(AX)
ADDQ $0x05, AX
JMP lz4s_s2_loop
repeat_four_lz4_s2_emit_copy_short_2b:
LEAL -256(R10), R10
MOVW $0x0019, (AX)
MOVW R10, 2(AX)
ADDQ $0x04, AX
JMP lz4s_s2_loop
repeat_three_lz4_s2_emit_copy_short_2b:
LEAL -4(R10), R10
MOVW $0x0015, (AX)
MOVB R10, 2(AX)
ADDQ $0x03, AX
JMP lz4s_s2_loop
repeat_two_lz4_s2_emit_copy_short_2b:
SHLL $0x02, R10
ORL $0x01, R10
MOVW R10, (AX)
ADDQ $0x02, AX
JMP lz4s_s2_loop
repeat_two_offset_lz4_s2_emit_copy_short_2b:
XORQ R8, R8
LEAL 1(R8)(R10*4), R10
MOVB R9, 1(AX)
SARL $0x08, R9
SHLL $0x05, R9
ORL R9, R10
MOVB R10, (AX)
ADDQ $0x02, AX
JMP lz4s_s2_loop
long_offset_short_lz4_s2:
MOVB $0xee, (AX)
MOVW R9, 1(AX)
LEAL -60(R10), R10
ADDQ $0x03, AX
// emitRepeat
emit_repeat_again_lz4_s2_emit_copy_short:
MOVL R10, R8
LEAL -4(R10), R10
CMPL R8, $0x08
JLE repeat_two_lz4_s2_emit_copy_short
CMPL R8, $0x0c
JGE cant_repeat_two_offset_lz4_s2_emit_copy_short
CMPL R9, $0x00000800
JLT repeat_two_offset_lz4_s2_emit_copy_short
cant_repeat_two_offset_lz4_s2_emit_copy_short:
CMPL R10, $0x00000104
JLT repeat_three_lz4_s2_emit_copy_short
CMPL R10, $0x00010100
JLT repeat_four_lz4_s2_emit_copy_short
CMPL R10, $0x0100ffff
JLT repeat_five_lz4_s2_emit_copy_short
LEAL -16842747(R10), R10
MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_lz4_s2_emit_copy_short
repeat_five_lz4_s2_emit_copy_short:
LEAL -65536(R10), R10
MOVL R10, R9
MOVW $0x001d, (AX)
MOVW R10, 2(AX)
SARL $0x10, R9
MOVB R9, 4(AX)
ADDQ $0x05, AX
JMP lz4s_s2_loop
repeat_four_lz4_s2_emit_copy_short:
LEAL -256(R10), R10
MOVW $0x0019, (AX)
MOVW R10, 2(AX)
ADDQ $0x04, AX
JMP lz4s_s2_loop
repeat_three_lz4_s2_emit_copy_short:
LEAL -4(R10), R10
MOVW $0x0015, (AX)
MOVB R10, 2(AX)
ADDQ $0x03, AX
JMP lz4s_s2_loop
repeat_two_lz4_s2_emit_copy_short:
SHLL $0x02, R10
ORL $0x01, R10
MOVW R10, (AX)
ADDQ $0x02, AX
JMP lz4s_s2_loop
repeat_two_offset_lz4_s2_emit_copy_short:
XORQ R8, R8
LEAL 1(R8)(R10*4), R10
MOVB R9, 1(AX)
SARL $0x08, R9
SHLL $0x05, R9
ORL R9, R10
MOVB R10, (AX)
ADDQ $0x02, AX
JMP lz4s_s2_loop
two_byte_offset_short_lz4_s2:
MOVL R10, R8
SHLL $0x02, R8
CMPL R10, $0x0c
JGE emit_copy_three_lz4_s2
CMPL R9, $0x00000800
JGE emit_copy_three_lz4_s2
LEAL -15(R8), R8
MOVB R9, 1(AX)
SHRL $0x08, R9
SHLL $0x05, R9
ORL R9, R8
MOVB R8, (AX)
ADDQ $0x02, AX
JMP lz4s_s2_loop
emit_copy_three_lz4_s2:
LEAL -2(R8), R8
MOVB R8, (AX)
MOVW R9, 1(AX)
ADDQ $0x03, AX
JMP lz4s_s2_loop
lz4s_s2_done:
MOVQ dst_base+0(FP), CX
SUBQ CX, AX
MOVQ SI, uncompressed+48(FP)
MOVQ AX, dstUsed+56(FP)
RET
lz4s_s2_corrupt:
XORQ AX, AX
LEAQ -1(AX), SI
MOVQ SI, uncompressed+48(FP)
RET
lz4s_s2_dstfull:
XORQ AX, AX
LEAQ -2(AX), SI
MOVQ SI, uncompressed+48(FP)
RET
// func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: SSE2
TEXT ·cvtLZ4BlockSnappyAsm(SB), NOSPLIT, $0-64
@ -19536,3 +20021,271 @@ lz4_snappy_dstfull:
LEAQ -2(AX), SI
MOVQ SI, uncompressed+48(FP)
RET
// func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: SSE2
TEXT ·cvtLZ4sBlockSnappyAsm(SB), NOSPLIT, $0-64
XORQ SI, SI
MOVQ dst_base+0(FP), AX
MOVQ dst_len+8(FP), CX
MOVQ src_base+24(FP), DX
MOVQ src_len+32(FP), BX
LEAQ (DX)(BX*1), BX
LEAQ -10(AX)(CX*1), CX
lz4s_snappy_loop:
CMPQ DX, BX
JAE lz4s_snappy_corrupt
CMPQ AX, CX
JAE lz4s_snappy_dstfull
MOVBQZX (DX), DI
MOVQ DI, R8
MOVQ DI, R9
SHRQ $0x04, R8
ANDQ $0x0f, R9
CMPQ DI, $0xf0
JB lz4s_snappy_ll_end
lz4s_snappy_ll_loop:
INCQ DX
CMPQ DX, BX
JAE lz4s_snappy_corrupt
MOVBQZX (DX), DI
ADDQ DI, R8
CMPQ DI, $0xff
JEQ lz4s_snappy_ll_loop
lz4s_snappy_ll_end:
LEAQ (DX)(R8*1), DI
ADDQ $0x03, R9
CMPQ DI, BX
JAE lz4s_snappy_corrupt
INCQ DX
INCQ DI
TESTQ R8, R8
JZ lz4s_snappy_lits_done
LEAQ (AX)(R8*1), R10
CMPQ R10, CX
JAE lz4s_snappy_dstfull
ADDQ R8, SI
LEAL -1(R8), R10
CMPL R10, $0x3c
JLT one_byte_lz4s_snappy
CMPL R10, $0x00000100
JLT two_bytes_lz4s_snappy
CMPL R10, $0x00010000
JLT three_bytes_lz4s_snappy
CMPL R10, $0x01000000
JLT four_bytes_lz4s_snappy
MOVB $0xfc, (AX)
MOVL R10, 1(AX)
ADDQ $0x05, AX
JMP memmove_long_lz4s_snappy
four_bytes_lz4s_snappy:
MOVL R10, R11
SHRL $0x10, R11
MOVB $0xf8, (AX)
MOVW R10, 1(AX)
MOVB R11, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_lz4s_snappy
three_bytes_lz4s_snappy:
MOVB $0xf4, (AX)
MOVW R10, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_lz4s_snappy
two_bytes_lz4s_snappy:
MOVB $0xf0, (AX)
MOVB R10, 1(AX)
ADDQ $0x02, AX
CMPL R10, $0x40
JL memmove_lz4s_snappy
JMP memmove_long_lz4s_snappy
one_byte_lz4s_snappy:
SHLB $0x02, R10
MOVB R10, (AX)
ADDQ $0x01, AX
memmove_lz4s_snappy:
LEAQ (AX)(R8*1), R10
// genMemMoveShort
CMPQ R8, $0x08
JLE emit_lit_memmove_lz4s_snappy_memmove_move_8
CMPQ R8, $0x10
JBE emit_lit_memmove_lz4s_snappy_memmove_move_8through16
CMPQ R8, $0x20
JBE emit_lit_memmove_lz4s_snappy_memmove_move_17through32
JMP emit_lit_memmove_lz4s_snappy_memmove_move_33through64
emit_lit_memmove_lz4s_snappy_memmove_move_8:
MOVQ (DX), R11
MOVQ R11, (AX)
JMP memmove_end_copy_lz4s_snappy
emit_lit_memmove_lz4s_snappy_memmove_move_8through16:
MOVQ (DX), R11
MOVQ -8(DX)(R8*1), DX
MOVQ R11, (AX)
MOVQ DX, -8(AX)(R8*1)
JMP memmove_end_copy_lz4s_snappy
emit_lit_memmove_lz4s_snappy_memmove_move_17through32:
MOVOU (DX), X0
MOVOU -16(DX)(R8*1), X1
MOVOU X0, (AX)
MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_lz4s_snappy
emit_lit_memmove_lz4s_snappy_memmove_move_33through64:
MOVOU (DX), X0
MOVOU 16(DX), X1
MOVOU -32(DX)(R8*1), X2
MOVOU -16(DX)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
MOVOU X2, -32(AX)(R8*1)
MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_lz4s_snappy:
MOVQ R10, AX
JMP lz4s_snappy_lits_emit_done
memmove_long_lz4s_snappy:
LEAQ (AX)(R8*1), R10
// genMemMoveLong
MOVOU (DX), X0
MOVOU 16(DX), X1
MOVOU -32(DX)(R8*1), X2
MOVOU -16(DX)(R8*1), X3
MOVQ R8, R12
SHRQ $0x05, R12
MOVQ AX, R11
ANDL $0x0000001f, R11
MOVQ $0x00000040, R13
SUBQ R11, R13
DECQ R12
JA emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32
LEAQ -32(DX)(R13*1), R11
LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_lz4s_snappylarge_big_loop_back:
MOVOU (R11), X4
MOVOU 16(R11), X5
MOVOA X4, (R14)
MOVOA X5, 16(R14)
ADDQ $0x20, R14
ADDQ $0x20, R11
ADDQ $0x20, R13
DECQ R12
JNA emit_lit_memmove_long_lz4s_snappylarge_big_loop_back
emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32:
MOVOU -32(DX)(R13*1), X4
MOVOU -16(DX)(R13*1), X5
MOVOA X4, -32(AX)(R13*1)
MOVOA X5, -16(AX)(R13*1)
ADDQ $0x20, R13
CMPQ R8, R13
JAE emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
MOVOU X2, -32(AX)(R8*1)
MOVOU X3, -16(AX)(R8*1)
MOVQ R10, AX
lz4s_snappy_lits_emit_done:
MOVQ DI, DX
lz4s_snappy_lits_done:
CMPQ DX, BX
JNE lz4s_snappy_match
CMPQ R9, $0x03
JEQ lz4s_snappy_done
JMP lz4s_snappy_corrupt
lz4s_snappy_match:
CMPQ R9, $0x03
JEQ lz4s_snappy_loop
LEAQ 2(DX), DI
CMPQ DI, BX
JAE lz4s_snappy_corrupt
MOVWQZX (DX), R8
MOVQ DI, DX
TESTQ R8, R8
JZ lz4s_snappy_corrupt
CMPQ R8, SI
JA lz4s_snappy_corrupt
CMPQ R9, $0x12
JNE lz4s_snappy_ml_done
lz4s_snappy_ml_loop:
MOVBQZX (DX), DI
INCQ DX
ADDQ DI, R9
CMPQ DX, BX
JAE lz4s_snappy_corrupt
CMPQ DI, $0xff
JEQ lz4s_snappy_ml_loop
lz4s_snappy_ml_done:
ADDQ R9, SI
// emitCopy
two_byte_offset_lz4_s2:
CMPL R9, $0x40
JLE two_byte_offset_short_lz4_s2
MOVB $0xee, (AX)
MOVW R8, 1(AX)
LEAL -60(R9), R9
ADDQ $0x03, AX
CMPQ AX, CX
JAE lz4s_snappy_loop
JMP two_byte_offset_lz4_s2
two_byte_offset_short_lz4_s2:
MOVL R9, DI
SHLL $0x02, DI
CMPL R9, $0x0c
JGE emit_copy_three_lz4_s2
CMPL R8, $0x00000800
JGE emit_copy_three_lz4_s2
LEAL -15(DI), DI
MOVB R8, 1(AX)
SHRL $0x08, R8
SHLL $0x05, R8
ORL R8, DI
MOVB DI, (AX)
ADDQ $0x02, AX
JMP lz4s_snappy_loop
emit_copy_three_lz4_s2:
LEAL -2(DI), DI
MOVB DI, (AX)
MOVW R8, 1(AX)
ADDQ $0x03, AX
JMP lz4s_snappy_loop
lz4s_snappy_done:
MOVQ dst_base+0(FP), CX
SUBQ CX, AX
MOVQ SI, uncompressed+48(FP)
MOVQ AX, dstUsed+56(FP)
RET
lz4s_snappy_corrupt:
XORQ AX, AX
LEAQ -1(AX), SI
MOVQ SI, uncompressed+48(FP)
RET
lz4s_snappy_dstfull:
XORQ AX, AX
LEAQ -2(AX), SI
MOVQ SI, uncompressed+48(FP)
RET

467
vendor/github.com/klauspost/compress/s2/lz4sconvert.go generated vendored Normal file
View file

@ -0,0 +1,467 @@
// Copyright (c) 2022 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package s2
import (
"encoding/binary"
"fmt"
)
// LZ4sConverter provides conversion from LZ4s.
// (Intel modified LZ4 Blocks)
// https://cdrdv2-public.intel.com/743912/743912-qat-programmers-guide-v2.0.pdf
// LZ4s is a variant of LZ4 block format. LZ4s should be considered as an intermediate compressed block format.
// The LZ4s format is selected when the application sets the compType to CPA_DC_LZ4S in CpaDcSessionSetupData.
// The LZ4s block returned by the Intel® QAT hardware can be used by an external
// software post-processing to generate other compressed data formats.
// The following table lists the differences between LZ4 and LZ4s block format. LZ4s block format uses
// the same high-level formatting as LZ4 block format with the following encoding changes:
// For Min Match of 4 bytes, Copy length value 1-15 means length 4-18 with 18 bytes adding an extra byte.
// ONLY "Min match of 4 bytes" is supported.
type LZ4sConverter struct {
}
// ConvertBlock will convert an LZ4s block and append it as an S2
// block without block length to dst.
// The uncompressed size is returned as well.
// dst must have capacity to contain the entire compressed block.
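//
// A minimal usage sketch, assuming lz4sBlock holds a valid LZ4s block and
// that the destination capacity chosen below is large enough for the
// converted output:
//
//	var conv LZ4sConverter
//	dst := make([]byte, 0, 2*len(lz4sBlock)+1024) // capacity is an assumption
//	s2Block, uncompressed, err := conv.ConvertBlock(dst, lz4sBlock)
//	if err != nil {
//		return err // ErrCorrupt or ErrDstTooSmall
//	}
//	fmt.Printf("S2 block: %d bytes, %d bytes uncompressed\n", len(s2Block), uncompressed)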
func (l *LZ4sConverter) ConvertBlock(dst, src []byte) ([]byte, int, error) {
if len(src) == 0 {
return dst, 0, nil
}
const debug = false
const inline = true
const lz4MinMatch = 3
s, d := 0, len(dst)
dst = dst[:cap(dst)]
if !debug && hasAmd64Asm {
res, sz := cvtLZ4sBlockAsm(dst[d:], src)
if res < 0 {
const (
errCorrupt = -1
errDstTooSmall = -2
)
switch res {
case errCorrupt:
return nil, 0, ErrCorrupt
case errDstTooSmall:
return nil, 0, ErrDstTooSmall
default:
return nil, 0, fmt.Errorf("unexpected result: %d", res)
}
}
if d+sz > len(dst) {
return nil, 0, ErrDstTooSmall
}
return dst[:d+sz], res, nil
}
dLimit := len(dst) - 10
var lastOffset uint16
var uncompressed int
if debug {
fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst))
}
for {
if s >= len(src) {
return dst[:d], 0, ErrCorrupt
}
// Read literal info
token := src[s]
ll := int(token >> 4)
ml := int(lz4MinMatch + (token & 0xf))
// If upper nibble is 15, literal length is extended
if token >= 0xf0 {
for {
s++
if s >= len(src) {
if debug {
fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src))
}
return dst[:d], 0, ErrCorrupt
}
val := src[s]
ll += int(val)
if val != 255 {
break
}
}
}
// Skip past token
if s+ll >= len(src) {
if debug {
fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src))
}
return nil, 0, ErrCorrupt
}
s++
if ll > 0 {
if d+ll > dLimit {
return nil, 0, ErrDstTooSmall
}
if debug {
fmt.Printf("emit %d literals\n", ll)
}
d += emitLiteralGo(dst[d:], src[s:s+ll])
s += ll
uncompressed += ll
}
// Check if we are done...
if ml == lz4MinMatch {
if s == len(src) {
break
}
// 0 bytes.
continue
}
// 2 byte offset
if s >= len(src)-2 {
if debug {
fmt.Printf("s (%d) >= len(src)-2 (%d)", s, len(src)-2)
}
return nil, 0, ErrCorrupt
}
offset := binary.LittleEndian.Uint16(src[s:])
s += 2
if offset == 0 {
if debug {
fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s)
}
return nil, 0, ErrCorrupt
}
if int(offset) > uncompressed {
if debug {
fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed)
}
return nil, 0, ErrCorrupt
}
if ml == lz4MinMatch+15 {
for {
if s >= len(src) {
if debug {
fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src))
}
return nil, 0, ErrCorrupt
}
val := src[s]
s++
ml += int(val)
if val != 255 {
if s >= len(src) {
if debug {
fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src))
}
return nil, 0, ErrCorrupt
}
break
}
}
}
if offset == lastOffset {
if debug {
fmt.Printf("emit repeat, length: %d, offset: %d\n", ml, offset)
}
if !inline {
d += emitRepeat16(dst[d:], offset, ml)
} else {
length := ml
dst := dst[d:]
for len(dst) > 5 {
// Repeat offset, make length cheaper
length -= 4
if length <= 4 {
dst[0] = uint8(length)<<2 | tagCopy1
dst[1] = 0
d += 2
break
}
if length < 8 && offset < 2048 {
// Encode WITH offset
dst[1] = uint8(offset)
dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1
d += 2
break
}
if length < (1<<8)+4 {
length -= 4
dst[2] = uint8(length)
dst[1] = 0
dst[0] = 5<<2 | tagCopy1
d += 3
break
}
if length < (1<<16)+(1<<8) {
length -= 1 << 8
dst[3] = uint8(length >> 8)
dst[2] = uint8(length >> 0)
dst[1] = 0
dst[0] = 6<<2 | tagCopy1
d += 4
break
}
const maxRepeat = (1 << 24) - 1
length -= 1 << 16
left := 0
if length > maxRepeat {
left = length - maxRepeat + 4
length = maxRepeat - 4
}
dst[4] = uint8(length >> 16)
dst[3] = uint8(length >> 8)
dst[2] = uint8(length >> 0)
dst[1] = 0
dst[0] = 7<<2 | tagCopy1
if left > 0 {
d += 5 + emitRepeat16(dst[5:], offset, left)
break
}
d += 5
break
}
}
} else {
if debug {
fmt.Printf("emit copy, length: %d, offset: %d\n", ml, offset)
}
if !inline {
d += emitCopy16(dst[d:], offset, ml)
} else {
length := ml
dst := dst[d:]
for len(dst) > 5 {
// Offset no more than 2 bytes.
if length > 64 {
off := 3
if offset < 2048 {
// emit 8 bytes as tagCopy1, rest as repeats.
dst[1] = uint8(offset)
dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1
length -= 8
off = 2
} else {
// Emit a length 60 copy, encoded as 3 bytes.
// Emit remaining as repeat value (minimum 4 bytes).
dst[2] = uint8(offset >> 8)
dst[1] = uint8(offset)
dst[0] = 59<<2 | tagCopy2
length -= 60
}
// Emit remaining as repeats, at least 4 bytes remain.
d += off + emitRepeat16(dst[off:], offset, length)
break
}
if length >= 12 || offset >= 2048 {
// Emit the remaining copy, encoded as 3 bytes.
dst[2] = uint8(offset >> 8)
dst[1] = uint8(offset)
dst[0] = uint8(length-1)<<2 | tagCopy2
d += 3
break
}
// Emit the remaining copy, encoded as 2 bytes.
dst[1] = uint8(offset)
dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
d += 2
break
}
}
lastOffset = offset
}
uncompressed += ml
if d > dLimit {
return nil, 0, ErrDstTooSmall
}
}
return dst[:d], uncompressed, nil
}
// ConvertBlockSnappy will convert an LZ4s block and append it
// as a Snappy block without block length to dst.
// The uncompressed size is returned as well.
// dst must have capacity to contain the entire compressed block.
func (l *LZ4sConverter) ConvertBlockSnappy(dst, src []byte) ([]byte, int, error) {
if len(src) == 0 {
return dst, 0, nil
}
const debug = false
const lz4MinMatch = 3
s, d := 0, len(dst)
dst = dst[:cap(dst)]
// Use assembly when possible
if !debug && hasAmd64Asm {
res, sz := cvtLZ4sBlockSnappyAsm(dst[d:], src)
if res < 0 {
const (
errCorrupt = -1
errDstTooSmall = -2
)
switch res {
case errCorrupt:
return nil, 0, ErrCorrupt
case errDstTooSmall:
return nil, 0, ErrDstTooSmall
default:
return nil, 0, fmt.Errorf("unexpected result: %d", res)
}
}
if d+sz > len(dst) {
return nil, 0, ErrDstTooSmall
}
return dst[:d+sz], res, nil
}
dLimit := len(dst) - 10
var uncompressed int
if debug {
fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst))
}
for {
if s >= len(src) {
return nil, 0, ErrCorrupt
}
// Read literal info
token := src[s]
ll := int(token >> 4)
ml := int(lz4MinMatch + (token & 0xf))
// If upper nibble is 15, literal length is extended
if token >= 0xf0 {
for {
s++
if s >= len(src) {
if debug {
fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src))
}
return nil, 0, ErrCorrupt
}
val := src[s]
ll += int(val)
if val != 255 {
break
}
}
}
// Skip past token
if s+ll >= len(src) {
if debug {
fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src))
}
return nil, 0, ErrCorrupt
}
s++
if ll > 0 {
if d+ll > dLimit {
return nil, 0, ErrDstTooSmall
}
if debug {
fmt.Printf("emit %d literals\n", ll)
}
d += emitLiteralGo(dst[d:], src[s:s+ll])
s += ll
uncompressed += ll
}
// Check if we are done...
if ml == lz4MinMatch {
if s == len(src) {
break
}
// 0 bytes.
continue
}
// 2 byte offset
if s >= len(src)-2 {
if debug {
fmt.Printf("s (%d) >= len(src)-2 (%d)", s, len(src)-2)
}
return nil, 0, ErrCorrupt
}
offset := binary.LittleEndian.Uint16(src[s:])
s += 2
if offset == 0 {
if debug {
fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s)
}
return nil, 0, ErrCorrupt
}
if int(offset) > uncompressed {
if debug {
fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed)
}
return nil, 0, ErrCorrupt
}
if ml == lz4MinMatch+15 {
for {
if s >= len(src) {
if debug {
fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src))
}
return nil, 0, ErrCorrupt
}
val := src[s]
s++
ml += int(val)
if val != 255 {
if s >= len(src) {
if debug {
fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src))
}
return nil, 0, ErrCorrupt
}
break
}
}
}
if debug {
fmt.Printf("emit copy, length: %d, offset: %d\n", ml, offset)
}
length := ml
// d += emitCopyNoRepeat(dst[d:], int(offset), ml)
for length > 0 {
if d >= dLimit {
return nil, 0, ErrDstTooSmall
}
// Offset no more than 2 bytes.
if length > 64 {
// Emit a length 64 copy, encoded as 3 bytes.
dst[d+2] = uint8(offset >> 8)
dst[d+1] = uint8(offset)
dst[d+0] = 63<<2 | tagCopy2
length -= 64
d += 3
continue
}
if length >= 12 || offset >= 2048 || length < 4 {
// Emit the remaining copy, encoded as 3 bytes.
dst[d+2] = uint8(offset >> 8)
dst[d+1] = uint8(offset)
dst[d+0] = uint8(length-1)<<2 | tagCopy2
d += 3
break
}
// Emit the remaining copy, encoded as 2 bytes.
dst[d+1] = uint8(offset)
dst[d+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
d += 2
break
}
uncompressed += ml
if d > dLimit {
return nil, 0, ErrDstTooSmall
}
}
return dst[:d], uncompressed, nil
}

View file

@ -9,6 +9,7 @@ import (
"encoding/binary"
"errors"
"fmt"
"hash/crc32"
"io"
"os"
"path/filepath"
@ -442,6 +443,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
}
}
var err error
if debugDecoder {
println("huff table input:", len(literals), "CRC:", crc32.ChecksumIEEE(literals))
}
huff, literals, err = huff0.ReadTable(literals, huff)
if err != nil {
println("reading huffman table:", err)

View file

@ -54,7 +54,7 @@ func (b *byteBuf) readBig(n int, dst []byte) ([]byte, error) {
func (b *byteBuf) readByte() (byte, error) {
bb := *b
if len(bb) < 1 {
return 0, nil
return 0, io.ErrUnexpectedEOF
}
r := bb[0]
*b = bb[1:]

View file

@ -32,7 +32,6 @@ type match struct {
length int32
rep int32
est int32
_ [12]byte // Aligned size to cache line: 4+4+4+4+4 bytes + 12 bytes padding = 32 bytes
}
const highScore = 25000
@ -189,12 +188,6 @@ encodeLoop:
panic("offset0 was 0")
}
bestOf := func(a, b *match) *match {
if a.est-b.est+(a.s-b.s)*bitsPerByte>>10 < 0 {
return a
}
return b
}
const goodEnough = 100
nextHashL := hashLen(cv, bestLongTableBits, bestLongLen)
@ -202,40 +195,41 @@ encodeLoop:
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]
matchAt := func(offset int32, s int32, first uint32, rep int32) match {
// Set m to a match at offset if it looks like that will improve compression.
improve := func(m *match, offset int32, s int32, first uint32, rep int32) {
if s-offset >= e.maxMatchOff || load3232(src, offset) != first {
return match{s: s, est: highScore}
return
}
if debugAsserts {
if !bytes.Equal(src[s:s+4], src[offset:offset+4]) {
panic(fmt.Sprintf("first match mismatch: %v != %v, first: %08x", src[s:s+4], src[offset:offset+4], first))
}
}
m := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
m.estBits(bitsPerByte)
return m
cand := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
cand.estBits(bitsPerByte)
if m.est >= highScore || cand.est-m.est+(cand.s-m.s)*bitsPerByte>>10 < 0 {
*m = cand
}
}
m1 := matchAt(candidateL.offset-e.cur, s, uint32(cv), -1)
m2 := matchAt(candidateL.prev-e.cur, s, uint32(cv), -1)
m3 := matchAt(candidateS.offset-e.cur, s, uint32(cv), -1)
m4 := matchAt(candidateS.prev-e.cur, s, uint32(cv), -1)
best := bestOf(bestOf(&m1, &m2), bestOf(&m3, &m4))
best := match{s: s, est: highScore}
improve(&best, candidateL.offset-e.cur, s, uint32(cv), -1)
improve(&best, candidateL.prev-e.cur, s, uint32(cv), -1)
improve(&best, candidateS.offset-e.cur, s, uint32(cv), -1)
improve(&best, candidateS.prev-e.cur, s, uint32(cv), -1)
if canRepeat && best.length < goodEnough {
cv32 := uint32(cv >> 8)
spp := s + 1
m1 := matchAt(spp-offset1, spp, cv32, 1)
m2 := matchAt(spp-offset2, spp, cv32, 2)
m3 := matchAt(spp-offset3, spp, cv32, 3)
best = bestOf(bestOf(best, &m1), bestOf(&m2, &m3))
improve(&best, spp-offset1, spp, cv32, 1)
improve(&best, spp-offset2, spp, cv32, 2)
improve(&best, spp-offset3, spp, cv32, 3)
if best.length > 0 {
cv32 = uint32(cv >> 24)
spp += 2
m1 := matchAt(spp-offset1, spp, cv32, 1)
m2 := matchAt(spp-offset2, spp, cv32, 2)
m3 := matchAt(spp-offset3, spp, cv32, 3)
best = bestOf(bestOf(best, &m1), bestOf(&m2, &m3))
improve(&best, spp-offset1, spp, cv32, 1)
improve(&best, spp-offset2, spp, cv32, 2)
improve(&best, spp-offset3, spp, cv32, 3)
}
}
// Load next and check...
@ -262,18 +256,16 @@ encodeLoop:
candidateL2 := e.longTable[hashLen(cv2, bestLongTableBits, bestLongLen)]
// Short at s+1
m1 := matchAt(candidateS.offset-e.cur, s, uint32(cv), -1)
improve(&best, candidateS.offset-e.cur, s, uint32(cv), -1)
// Long at s+1, s+2
m2 := matchAt(candidateL.offset-e.cur, s, uint32(cv), -1)
m3 := matchAt(candidateL.prev-e.cur, s, uint32(cv), -1)
m4 := matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1)
m5 := matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1)
best = bestOf(bestOf(bestOf(best, &m1), &m2), bestOf(bestOf(&m3, &m4), &m5))
improve(&best, candidateL.offset-e.cur, s, uint32(cv), -1)
improve(&best, candidateL.prev-e.cur, s, uint32(cv), -1)
improve(&best, candidateL2.offset-e.cur, s+1, uint32(cv2), -1)
improve(&best, candidateL2.prev-e.cur, s+1, uint32(cv2), -1)
if false {
// Short at s+3.
// Too often worse...
m := matchAt(e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1)
best = bestOf(best, &m)
improve(&best, e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1)
}
// See if we can find a better match by checking where the current best ends.
// Use that offset to see if we can find a better full match.
@ -284,13 +276,10 @@ encodeLoop:
// For this compression level 2 yields the best results.
const skipBeginning = 2
if pos := candidateEnd.offset - e.cur - best.length + skipBeginning; pos >= 0 {
m := matchAt(pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
bestEnd := bestOf(best, &m)
improve(&best, pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
if pos := candidateEnd.prev - e.cur - best.length + skipBeginning; pos >= 0 {
m := matchAt(pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
bestEnd = bestOf(bestEnd, &m)
improve(&best, pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
}
best = bestEnd
}
}
}

View file

@ -314,9 +314,6 @@ func (s *sequenceDecs) decodeSync(hist []byte) error {
}
size := ll + ml + len(out)
if size-startSize > maxBlockSize {
if size-startSize == 424242 {
panic("here")
}
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
if size > cap(out) {
@ -427,8 +424,7 @@ func (s *sequenceDecs) decodeSync(hist []byte) error {
}
}
// Check if space for literals
if size := len(s.literals) + len(s.out) - startSize; size > maxBlockSize {
if size := len(s.literals) + len(out) - startSize; size > maxBlockSize {
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}

View file

@ -148,7 +148,6 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
s.seqSize += ctx.litRemain
if s.seqSize > maxBlockSize {
return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
err := br.close()
if err != nil {

2
vendor/modules.txt vendored
View file

@ -335,7 +335,7 @@ github.com/jmespath/go-jmespath
# github.com/jpillora/backoff v1.0.0
## explicit; go 1.13
github.com/jpillora/backoff
# github.com/klauspost/compress v1.16.0
# github.com/klauspost/compress v1.16.3
## explicit; go 1.18
github.com/klauspost/compress
github.com/klauspost/compress/flate