mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
app/vmagent/remotewrite: add benchmarks for comparing the performance of standard Snappy encoder with github.com/klauspost/compress/s2 encoder
The standard Snappy encoder from github.com/golang/snappy shows quite good performance numbers for compressing the Prometheus remote_write proto messages according to the added benchmarks, so there is no need in switching to github.com/klauspost/compress/s2 yet.
This commit is contained in:
parent
b4410b1c63
commit
2b55d167d7
23 changed files with 25601 additions and 4 deletions
62
app/vmagent/remotewrite/pendingseries_test.go
Normal file
62
app/vmagent/remotewrite/pendingseries_test.go
Normal file
|
@ -0,0 +1,62 @@
|
||||||
|
package remotewrite
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/golang/snappy"
|
||||||
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestPushWriteRequest(t *testing.T) {
|
||||||
|
for _, rowsCount := range []int{1, 10, 100, 1e3, 1e4} {
|
||||||
|
t.Run(fmt.Sprintf("%d", rowsCount), func(t *testing.T) {
|
||||||
|
testPushWriteRequest(t, rowsCount)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func testPushWriteRequest(t *testing.T, rowsCount int) {
|
||||||
|
wr := newTestWriteRequest(rowsCount, 10)
|
||||||
|
pushBlockLen := 0
|
||||||
|
pushBlock := func(block []byte) {
|
||||||
|
if pushBlockLen > 0 {
|
||||||
|
panic(fmt.Errorf("BUG: pushBlock called multiple times; pushBlockLen=%d at first call, len(block)=%d at second call", pushBlockLen, len(block)))
|
||||||
|
}
|
||||||
|
pushBlockLen = len(block)
|
||||||
|
}
|
||||||
|
pushWriteRequest(wr, pushBlock)
|
||||||
|
b := prompbmarshal.MarshalWriteRequest(nil, wr)
|
||||||
|
zb := snappy.Encode(nil, b)
|
||||||
|
maxPushBlockLen := len(zb)
|
||||||
|
minPushBlockLen := maxPushBlockLen / 2
|
||||||
|
if pushBlockLen < minPushBlockLen {
|
||||||
|
t.Fatalf("unexpected block len after pushWriteRequest; got %d bytes; must be at least %d bytes", pushBlockLen, minPushBlockLen)
|
||||||
|
}
|
||||||
|
if pushBlockLen > maxPushBlockLen {
|
||||||
|
t.Fatalf("unexpected block len after pushWriteRequest; got %d bytes; must be smaller or equal to %d bytes", pushBlockLen, maxPushBlockLen)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func newTestWriteRequest(seriesCount, labelsCount int) *prompbmarshal.WriteRequest {
|
||||||
|
var wr prompbmarshal.WriteRequest
|
||||||
|
for i := 0; i < seriesCount; i++ {
|
||||||
|
var labels []prompbmarshal.Label
|
||||||
|
for j := 0; j < labelsCount; j++ {
|
||||||
|
labels = append(labels, prompbmarshal.Label{
|
||||||
|
Name: fmt.Sprintf("label_%d_%d", i, j),
|
||||||
|
Value: fmt.Sprintf("value_%d_%d", i, j),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
wr.Timeseries = append(wr.Timeseries, prompbmarshal.TimeSeries{
|
||||||
|
Labels: labels,
|
||||||
|
Samples: []prompbmarshal.Sample{
|
||||||
|
{
|
||||||
|
Value: float64(i),
|
||||||
|
Timestamp: 1000*int64(i),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return &wr
|
||||||
|
}
|
36
app/vmagent/remotewrite/pendingseries_timing_test.go
Normal file
36
app/vmagent/remotewrite/pendingseries_timing_test.go
Normal file
|
@ -0,0 +1,36 @@
|
||||||
|
package remotewrite
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/golang/snappy"
|
||||||
|
"github.com/klauspost/compress/s2"
|
||||||
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||||
|
)
|
||||||
|
|
||||||
|
func BenchmarkCompressWriteRequestSnappy(b *testing.B) {
|
||||||
|
b.Run("snappy", func(b *testing.B) {
|
||||||
|
benchmarkCompressWriteRequest(b, snappy.Encode)
|
||||||
|
})
|
||||||
|
b.Run("s2", func(b *testing.B) {
|
||||||
|
benchmarkCompressWriteRequest(b, s2.EncodeSnappy)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func benchmarkCompressWriteRequest(b *testing.B, compressFunc func(dst, src []byte) []byte) {
|
||||||
|
for _, rowsCount := range []int{1, 10, 100, 1e3, 1e4} {
|
||||||
|
b.Run(fmt.Sprintf("rows_%d", rowsCount), func(b *testing.B) {
|
||||||
|
wr := newTestWriteRequest(rowsCount, 10)
|
||||||
|
data := prompbmarshal.MarshalWriteRequest(nil, wr)
|
||||||
|
b.ReportAllocs()
|
||||||
|
b.SetBytes(int64(rowsCount))
|
||||||
|
b.RunParallel(func(pb *testing.PB) {
|
||||||
|
var zb []byte
|
||||||
|
for pb.Next() {
|
||||||
|
zb = compressFunc(zb[:cap(zb)], data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
2
go.mod
2
go.mod
|
@ -4,7 +4,7 @@ go 1.19
|
||||||
|
|
||||||
require (
|
require (
|
||||||
cloud.google.com/go/storage v1.26.0
|
cloud.google.com/go/storage v1.26.0
|
||||||
github.com/VictoriaMetrics/fastcache v1.10.0
|
github.com/VictoriaMetrics/fastcache v1.12.0
|
||||||
|
|
||||||
// Do not use the original github.com/valyala/fasthttp because of issues
|
// Do not use the original github.com/valyala/fasthttp because of issues
|
||||||
// like https://github.com/valyala/fasthttp/commit/996610f021ff45fdc98c2ce7884d5fa4e7f9199b
|
// like https://github.com/valyala/fasthttp/commit/996610f021ff45fdc98c2ce7884d5fa4e7f9199b
|
||||||
|
|
4
go.sum
4
go.sum
|
@ -104,8 +104,8 @@ github.com/PuerkitoBio/urlesc v0.0.0-20160726150825-5bd2802263f2/go.mod h1:uGdko
|
||||||
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE=
|
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE=
|
||||||
github.com/Shopify/sarama v1.19.0/go.mod h1:FVkBWblsNy7DGZRfXLU0O9RCGt5g3g3yEuWXgklEdEo=
|
github.com/Shopify/sarama v1.19.0/go.mod h1:FVkBWblsNy7DGZRfXLU0O9RCGt5g3g3yEuWXgklEdEo=
|
||||||
github.com/Shopify/toxiproxy v2.1.4+incompatible/go.mod h1:OXgGpZ6Cli1/URJOF1DMxUHB2q5Ap20/P/eIdh4G0pI=
|
github.com/Shopify/toxiproxy v2.1.4+incompatible/go.mod h1:OXgGpZ6Cli1/URJOF1DMxUHB2q5Ap20/P/eIdh4G0pI=
|
||||||
github.com/VictoriaMetrics/fastcache v1.10.0 h1:5hDJnLsKLpnUEToub7ETuRu8RCkb40woBZAUiKonXzY=
|
github.com/VictoriaMetrics/fastcache v1.12.0 h1:vnVi/y9yKDcD9akmc4NqAoqgQhJrOwUF+j9LTgn4QDE=
|
||||||
github.com/VictoriaMetrics/fastcache v1.10.0/go.mod h1:tjiYeEfYXCqacuvYw/7UoDIeJaNxq6132xHICNP77w8=
|
github.com/VictoriaMetrics/fastcache v1.12.0/go.mod h1:tjiYeEfYXCqacuvYw/7UoDIeJaNxq6132xHICNP77w8=
|
||||||
github.com/VictoriaMetrics/fasthttp v1.1.0 h1:3crd4YWHsMwu60GUXRH6OstowiFvqrwS4a/ueoLdLL0=
|
github.com/VictoriaMetrics/fasthttp v1.1.0 h1:3crd4YWHsMwu60GUXRH6OstowiFvqrwS4a/ueoLdLL0=
|
||||||
github.com/VictoriaMetrics/fasthttp v1.1.0/go.mod h1:/7DMcogqd+aaD3G3Hg5kFgoFwlR2uydjiWvoLp5ZTqQ=
|
github.com/VictoriaMetrics/fasthttp v1.1.0/go.mod h1:/7DMcogqd+aaD3G3Hg5kFgoFwlR2uydjiWvoLp5ZTqQ=
|
||||||
github.com/VictoriaMetrics/metrics v1.18.1/go.mod h1:ArjwVz7WpgpegX/JpB0zpNF2h2232kErkEnzH1sxMmA=
|
github.com/VictoriaMetrics/metrics v1.18.1/go.mod h1:ArjwVz7WpgpegX/JpB0zpNF2h2232kErkEnzH1sxMmA=
|
||||||
|
|
15
vendor/github.com/klauspost/compress/s2/.gitignore
generated
vendored
Normal file
15
vendor/github.com/klauspost/compress/s2/.gitignore
generated
vendored
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
testdata/bench
|
||||||
|
|
||||||
|
# These explicitly listed benchmark data files are for an obsolete version of
|
||||||
|
# snappy_test.go.
|
||||||
|
testdata/alice29.txt
|
||||||
|
testdata/asyoulik.txt
|
||||||
|
testdata/fireworks.jpeg
|
||||||
|
testdata/geo.protodata
|
||||||
|
testdata/html
|
||||||
|
testdata/html_x_4
|
||||||
|
testdata/kppkn.gtb
|
||||||
|
testdata/lcet10.txt
|
||||||
|
testdata/paper-100k.pdf
|
||||||
|
testdata/plrabn12.txt
|
||||||
|
testdata/urls.10K
|
28
vendor/github.com/klauspost/compress/s2/LICENSE
generated
vendored
Normal file
28
vendor/github.com/klauspost/compress/s2/LICENSE
generated
vendored
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
Copyright (c) 2011 The Snappy-Go Authors. All rights reserved.
|
||||||
|
Copyright (c) 2019 Klaus Post. All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
* Redistributions in binary form must reproduce the above
|
||||||
|
copyright notice, this list of conditions and the following disclaimer
|
||||||
|
in the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
* Neither the name of Google Inc. nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
965
vendor/github.com/klauspost/compress/s2/README.md
generated
vendored
Normal file
965
vendor/github.com/klauspost/compress/s2/README.md
generated
vendored
Normal file
|
@ -0,0 +1,965 @@
|
||||||
|
# S2 Compression
|
||||||
|
|
||||||
|
S2 is an extension of [Snappy](https://github.com/google/snappy).
|
||||||
|
|
||||||
|
S2 is aimed for high throughput, which is why it features concurrent compression for bigger payloads.
|
||||||
|
|
||||||
|
Decoding is compatible with Snappy compressed content, but content compressed with S2 cannot be decompressed by Snappy.
|
||||||
|
This means that S2 can seamlessly replace Snappy without converting compressed content.
|
||||||
|
|
||||||
|
S2 can produce Snappy compatible output, faster and better than Snappy.
|
||||||
|
If you want full benefit of the changes you should use s2 without Snappy compatibility.
|
||||||
|
|
||||||
|
S2 is designed to have high throughput on content that cannot be compressed.
|
||||||
|
This is important, so you don't have to worry about spending CPU cycles on already compressed data.
|
||||||
|
|
||||||
|
## Benefits over Snappy
|
||||||
|
|
||||||
|
* Better compression
|
||||||
|
* Adjustable compression (3 levels)
|
||||||
|
* Concurrent stream compression
|
||||||
|
* Faster decompression, even for Snappy compatible content
|
||||||
|
* Concurrent Snappy/S2 stream decompression
|
||||||
|
* Ability to quickly skip forward in compressed stream
|
||||||
|
* Random seeking with indexes
|
||||||
|
* Compatible with reading Snappy compressed content
|
||||||
|
* Smaller block size overhead on incompressible blocks
|
||||||
|
* Block concatenation
|
||||||
|
* Uncompressed stream mode
|
||||||
|
* Automatic stream size padding
|
||||||
|
* Snappy compatible block compression
|
||||||
|
|
||||||
|
## Drawbacks over Snappy
|
||||||
|
|
||||||
|
* Not optimized for 32 bit systems
|
||||||
|
* Streams use slightly more memory due to larger blocks and concurrency (configurable)
|
||||||
|
|
||||||
|
# Usage
|
||||||
|
|
||||||
|
Installation: `go get -u github.com/klauspost/compress/s2`
|
||||||
|
|
||||||
|
Full package documentation:
|
||||||
|
|
||||||
|
[![godoc][1]][2]
|
||||||
|
|
||||||
|
[1]: https://godoc.org/github.com/klauspost/compress?status.svg
|
||||||
|
[2]: https://godoc.org/github.com/klauspost/compress/s2
|
||||||
|
|
||||||
|
## Compression
|
||||||
|
|
||||||
|
```Go
|
||||||
|
func EncodeStream(src io.Reader, dst io.Writer) error {
|
||||||
|
enc := s2.NewWriter(dst)
|
||||||
|
_, err := io.Copy(enc, src)
|
||||||
|
if err != nil {
|
||||||
|
enc.Close()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
// Blocks until compression is done.
|
||||||
|
return enc.Close()
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
You should always call `enc.Close()`, otherwise you will leak resources and your encode will be incomplete.
|
||||||
|
|
||||||
|
For the best throughput, you should attempt to reuse the `Writer` using the `Reset()` method.
|
||||||
|
|
||||||
|
The Writer in S2 is always buffered, therefore `NewBufferedWriter` in Snappy can be replaced with `NewWriter` in S2.
|
||||||
|
It is possible to flush any buffered data using the `Flush()` method.
|
||||||
|
This will block until all data sent to the encoder has been written to the output.
|
||||||
|
|
||||||
|
S2 also supports the `io.ReaderFrom` interface, which will consume all input from a reader.
|
||||||
|
|
||||||
|
As a final method to compress data, if you have a single block of data you would like to have encoded as a stream,
|
||||||
|
a slightly more efficient method is to use the `EncodeBuffer` method.
|
||||||
|
This will take ownership of the buffer until the stream is closed.
|
||||||
|
|
||||||
|
```Go
|
||||||
|
func EncodeStream(src []byte, dst io.Writer) error {
|
||||||
|
enc := s2.NewWriter(dst)
|
||||||
|
// The encoder owns the buffer until Flush or Close is called.
|
||||||
|
err := enc.EncodeBuffer(buf)
|
||||||
|
if err != nil {
|
||||||
|
enc.Close()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
// Blocks until compression is done.
|
||||||
|
return enc.Close()
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Each call to `EncodeBuffer` will result in discrete blocks being created without buffering,
|
||||||
|
so it should only be used a single time per stream.
|
||||||
|
If you need to write several blocks, you should use the regular io.Writer interface.
|
||||||
|
|
||||||
|
|
||||||
|
## Decompression
|
||||||
|
|
||||||
|
```Go
|
||||||
|
func DecodeStream(src io.Reader, dst io.Writer) error {
|
||||||
|
dec := s2.NewReader(src)
|
||||||
|
_, err := io.Copy(dst, dec)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Similar to the Writer, a Reader can be reused using the `Reset` method.
|
||||||
|
|
||||||
|
For the best possible throughput, there is a `EncodeBuffer(buf []byte)` function available.
|
||||||
|
However, it requires that the provided buffer isn't used after it is handed over to S2 and until the stream is flushed or closed.
|
||||||
|
|
||||||
|
For smaller data blocks, there is also a non-streaming interface: `Encode()`, `EncodeBetter()` and `Decode()`.
|
||||||
|
Do however note that these functions (similar to Snappy) do not provide validation of data,
|
||||||
|
so data corruption may be undetected. Stream encoding provides CRC checks of data.
|
||||||
|
|
||||||
|
It is possible to efficiently skip forward in a compressed stream using the `Skip()` method.
|
||||||
|
For big skips the decompressor is able to skip blocks without decompressing them.
|
||||||
|
|
||||||
|
## Single Blocks
|
||||||
|
|
||||||
|
Similar to Snappy S2 offers single block compression.
|
||||||
|
Blocks do not offer the same flexibility and safety as streams,
|
||||||
|
but may be preferable for very small payloads, less than 100K.
|
||||||
|
|
||||||
|
Using a simple `dst := s2.Encode(nil, src)` will compress `src` and return the compressed result.
|
||||||
|
It is possible to provide a destination buffer.
|
||||||
|
If the buffer has a capacity of `s2.MaxEncodedLen(len(src))` it will be used.
|
||||||
|
If not, a new one will be allocated.
|
||||||
|
|
||||||
|
Alternatively `EncodeBetter`/`EncodeBest` can also be used for better, but slightly slower compression.
|
||||||
|
|
||||||
|
Similarly to decompress a block you can use `dst, err := s2.Decode(nil, src)`.
|
||||||
|
Again an optional destination buffer can be supplied.
|
||||||
|
The `s2.DecodedLen(src)` can be used to get the minimum capacity needed.
|
||||||
|
If that is not satisfied a new buffer will be allocated.
|
||||||
|
|
||||||
|
Block functions always operate on a single goroutine since they should only be used for small payloads.
|
||||||
|
|
||||||
|
# Commandline tools
|
||||||
|
|
||||||
|
Some very simple commandline tools are provided; `s2c` for compression and `s2d` for decompression.
|
||||||
|
|
||||||
|
Binaries can be downloaded on the [Releases Page](https://github.com/klauspost/compress/releases).
|
||||||
|
|
||||||
|
Installing then requires Go to be installed. To install them, use:
|
||||||
|
|
||||||
|
`go install github.com/klauspost/compress/s2/cmd/s2c@latest && go install github.com/klauspost/compress/s2/cmd/s2d@latest`
|
||||||
|
|
||||||
|
To build binaries to the current folder use:
|
||||||
|
|
||||||
|
`go build github.com/klauspost/compress/s2/cmd/s2c && go build github.com/klauspost/compress/s2/cmd/s2d`
|
||||||
|
|
||||||
|
|
||||||
|
## s2c
|
||||||
|
|
||||||
|
```
|
||||||
|
Usage: s2c [options] file1 file2
|
||||||
|
|
||||||
|
Compresses all files supplied as input separately.
|
||||||
|
Output files are written as 'filename.ext.s2' or 'filename.ext.snappy'.
|
||||||
|
By default output files will be overwritten.
|
||||||
|
Use - as the only file name to read from stdin and write to stdout.
|
||||||
|
|
||||||
|
Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
|
||||||
|
Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
|
||||||
|
|
||||||
|
File names beginning with 'http://' and 'https://' will be downloaded and compressed.
|
||||||
|
Only http response code 200 is accepted.
|
||||||
|
|
||||||
|
Options:
|
||||||
|
-bench int
|
||||||
|
Run benchmark n times. No output will be written
|
||||||
|
-blocksize string
|
||||||
|
Max block size. Examples: 64K, 256K, 1M, 4M. Must be power of two and <= 4MB (default "4M")
|
||||||
|
-c Write all output to stdout. Multiple input files will be concatenated
|
||||||
|
-cpu int
|
||||||
|
Compress using this amount of threads (default 32)
|
||||||
|
-faster
|
||||||
|
Compress faster, but with a minor compression loss
|
||||||
|
-help
|
||||||
|
Display help
|
||||||
|
-index
|
||||||
|
Add seek index (default true)
|
||||||
|
-o string
|
||||||
|
Write output to another file. Single input file only
|
||||||
|
-pad string
|
||||||
|
Pad size to a multiple of this value, Examples: 500, 64K, 256K, 1M, 4M, etc (default "1")
|
||||||
|
-q Don't write any output to terminal, except errors
|
||||||
|
-rm
|
||||||
|
Delete source file(s) after successful compression
|
||||||
|
-safe
|
||||||
|
Do not overwrite output files
|
||||||
|
-slower
|
||||||
|
Compress more, but a lot slower
|
||||||
|
-snappy
|
||||||
|
Generate Snappy compatible output stream
|
||||||
|
-verify
|
||||||
|
Verify written files
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
## s2d
|
||||||
|
|
||||||
|
```
|
||||||
|
Usage: s2d [options] file1 file2
|
||||||
|
|
||||||
|
Decompresses all files supplied as input. Input files must end with '.s2' or '.snappy'.
|
||||||
|
Output file names have the extension removed. By default output files will be overwritten.
|
||||||
|
Use - as the only file name to read from stdin and write to stdout.
|
||||||
|
|
||||||
|
Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
|
||||||
|
Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
|
||||||
|
|
||||||
|
File names beginning with 'http://' and 'https://' will be downloaded and decompressed.
|
||||||
|
Extensions on downloaded files are ignored. Only http response code 200 is accepted.
|
||||||
|
|
||||||
|
Options:
|
||||||
|
-bench int
|
||||||
|
Run benchmark n times. No output will be written
|
||||||
|
-c Write all output to stdout. Multiple input files will be concatenated
|
||||||
|
-help
|
||||||
|
Display help
|
||||||
|
-o string
|
||||||
|
Write output to another file. Single input file only
|
||||||
|
-offset string
|
||||||
|
Start at offset. Examples: 92, 64K, 256K, 1M, 4M. Requires Index
|
||||||
|
-q Don't write any output to terminal, except errors
|
||||||
|
-rm
|
||||||
|
Delete source file(s) after successful decompression
|
||||||
|
-safe
|
||||||
|
Do not overwrite output files
|
||||||
|
-tail string
|
||||||
|
Return last of compressed file. Examples: 92, 64K, 256K, 1M, 4M. Requires Index
|
||||||
|
-verify
|
||||||
|
Verify files, but do not write output
|
||||||
|
```
|
||||||
|
|
||||||
|
## s2sx: self-extracting archives
|
||||||
|
|
||||||
|
s2sx allows creating self-extracting archives with no dependencies.
|
||||||
|
|
||||||
|
By default, executables are created for the same platforms as the host os,
|
||||||
|
but this can be overridden with `-os` and `-arch` parameters.
|
||||||
|
|
||||||
|
Extracted files have 0666 permissions, except when untar option used.
|
||||||
|
|
||||||
|
```
|
||||||
|
Usage: s2sx [options] file1 file2
|
||||||
|
|
||||||
|
Compresses all files supplied as input separately.
|
||||||
|
If files have '.s2' extension they are assumed to be compressed already.
|
||||||
|
Output files are written as 'filename.s2sx' and with '.exe' for windows targets.
|
||||||
|
If output is big, an additional file with ".more" is written. This must be included as well.
|
||||||
|
By default output files will be overwritten.
|
||||||
|
|
||||||
|
Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
|
||||||
|
Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
|
||||||
|
|
||||||
|
Options:
|
||||||
|
-arch string
|
||||||
|
Destination architecture (default "amd64")
|
||||||
|
-c Write all output to stdout. Multiple input files will be concatenated
|
||||||
|
-cpu int
|
||||||
|
Compress using this amount of threads (default 32)
|
||||||
|
-help
|
||||||
|
Display help
|
||||||
|
-max string
|
||||||
|
Maximum executable size. Rest will be written to another file. (default "1G")
|
||||||
|
-os string
|
||||||
|
Destination operating system (default "windows")
|
||||||
|
-q Don't write any output to terminal, except errors
|
||||||
|
-rm
|
||||||
|
Delete source file(s) after successful compression
|
||||||
|
-safe
|
||||||
|
Do not overwrite output files
|
||||||
|
-untar
|
||||||
|
Untar on destination
|
||||||
|
```
|
||||||
|
|
||||||
|
Available platforms are:
|
||||||
|
|
||||||
|
* darwin-amd64
|
||||||
|
* darwin-arm64
|
||||||
|
* linux-amd64
|
||||||
|
* linux-arm
|
||||||
|
* linux-arm64
|
||||||
|
* linux-mips64
|
||||||
|
* linux-ppc64le
|
||||||
|
* windows-386
|
||||||
|
* windows-amd64
|
||||||
|
|
||||||
|
By default, there is a size limit of 1GB for the output executable.
|
||||||
|
|
||||||
|
When this is exceeded the remaining file content is written to a file called
|
||||||
|
output+`.more`. This file must be included for a successful extraction and
|
||||||
|
placed alongside the executable.
|
||||||
|
|
||||||
|
This file *must* have the same name as the executable, so if the executable is renamed,
|
||||||
|
so must the `.more` file.
|
||||||
|
|
||||||
|
This functionality is disabled with stdin/stdout.
|
||||||
|
|
||||||
|
### Self-extracting TAR files
|
||||||
|
|
||||||
|
If you wrap a TAR file you can specify `-untar` to make it untar on the destination host.
|
||||||
|
|
||||||
|
Files are extracted to the current folder with the path specified in the tar file.
|
||||||
|
|
||||||
|
Note that tar files are not validated before they are wrapped.
|
||||||
|
|
||||||
|
For security reasons files that move below the root folder are not allowed.
|
||||||
|
|
||||||
|
# Performance
|
||||||
|
|
||||||
|
This section will focus on comparisons to Snappy.
|
||||||
|
This package is solely aimed at replacing Snappy as a high speed compression package.
|
||||||
|
If you are mainly looking for better compression [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd)
|
||||||
|
gives better compression, but typically at speeds slightly below "better" mode in this package.
|
||||||
|
|
||||||
|
Compression is increased compared to Snappy, mostly around 5-20% and the throughput is typically 25-40% increased (single threaded) compared to the Snappy Go implementation.
|
||||||
|
|
||||||
|
Streams are concurrently compressed. The stream will be distributed among all available CPU cores for the best possible throughput.
|
||||||
|
|
||||||
|
A "better" compression mode is also available. This allows to trade a bit of speed for a minor compression gain.
|
||||||
|
The content compressed in this mode is fully compatible with the standard decoder.
|
||||||
|
|
||||||
|
Snappy vs S2 **compression** speed on 16 core (32 thread) computer, using all threads and a single thread (1 CPU):
|
||||||
|
|
||||||
|
| File | S2 speed | S2 Throughput | S2 % smaller | S2 "better" | "better" throughput | "better" % smaller |
|
||||||
|
|-----------------------------------------------------------------------------------------------------|----------|---------------|--------------|-------------|---------------------|--------------------|
|
||||||
|
| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 12.70x | 10556 MB/s | 7.35% | 4.15x | 3455 MB/s | 12.79% |
|
||||||
|
| (1 CPU) | 1.14x | 948 MB/s | - | 0.42x | 349 MB/s | - |
|
||||||
|
| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 17.13x | 14484 MB/s | 31.60% | 10.09x | 8533 MB/s | 37.71% |
|
||||||
|
| (1 CPU) | 1.33x | 1127 MB/s | - | 0.70x | 589 MB/s | - |
|
||||||
|
| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 15.14x | 12000 MB/s | -5.79% | 6.59x | 5223 MB/s | 5.80% |
|
||||||
|
| (1 CPU) | 1.11x | 877 MB/s | - | 0.47x | 370 MB/s | - |
|
||||||
|
| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 14.62x | 12116 MB/s | 15.90% | 5.35x | 4430 MB/s | 16.08% |
|
||||||
|
| (1 CPU) | 1.38x | 1146 MB/s | - | 0.38x | 312 MB/s | - |
|
||||||
|
| [adresser.json](https://files.klauspost.com/compress/adresser.json.zst) | 8.83x | 17579 MB/s | 43.86% | 6.54x | 13011 MB/s | 47.23% |
|
||||||
|
| (1 CPU) | 1.14x | 2259 MB/s | - | 0.74x | 1475 MB/s | - |
|
||||||
|
| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 16.72x | 14019 MB/s | 24.02% | 10.11x | 8477 MB/s | 30.48% |
|
||||||
|
| (1 CPU) | 1.24x | 1043 MB/s | - | 0.70x | 586 MB/s | - |
|
||||||
|
| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 13.33x | 9254 MB/s | 1.84% | 6.75x | 4686 MB/s | 6.72% |
|
||||||
|
| (1 CPU) | 0.97x | 672 MB/s | - | 0.53x | 366 MB/s | - |
|
||||||
|
| sharnd.out.2gb | 2.11x | 12639 MB/s | 0.01% | 1.98x | 11833 MB/s | 0.01% |
|
||||||
|
| (1 CPU) | 0.93x | 5594 MB/s | - | 1.34x | 8030 MB/s | - |
|
||||||
|
| [enwik9](http://mattmahoney.net/dc/textdata.html) | 19.34x | 8220 MB/s | 3.98% | 7.87x | 3345 MB/s | 15.82% |
|
||||||
|
| (1 CPU) | 1.06x | 452 MB/s | - | 0.50x | 213 MB/s | - |
|
||||||
|
| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 10.48x | 6124 MB/s | 5.67% | 3.76x | 2197 MB/s | 12.60% |
|
||||||
|
| (1 CPU) | 0.97x | 568 MB/s | - | 0.46x | 271 MB/s | - |
|
||||||
|
| [enwik10](https://encode.su/threads/3315-enwik10-benchmark-results) | 21.07x | 9020 MB/s | 6.36% | 6.91x | 2959 MB/s | 16.95% |
|
||||||
|
| (1 CPU) | 1.07x | 460 MB/s | - | 0.51x | 220 MB/s | - |
|
||||||
|
|
||||||
|
### Legend
|
||||||
|
|
||||||
|
* `S2 speed`: Speed of S2 compared to Snappy, using 16 cores and 1 core.
|
||||||
|
* `S2 throughput`: Throughput of S2 in MB/s.
|
||||||
|
* `S2 % smaller`: How many percent of the Snappy output size is S2 better.
|
||||||
|
* `S2 "better"`: Speed when enabling "better" compression mode in S2 compared to Snappy.
|
||||||
|
* `"better" throughput`: Speed when enabling "better" compression mode in S2 compared to Snappy.
|
||||||
|
* `"better" % smaller`: How many percent of the Snappy output size is S2 better when using "better" compression.
|
||||||
|
|
||||||
|
There is a good speedup across the board when using a single thread and a significant speedup when using multiple threads.
|
||||||
|
|
||||||
|
Machine generated data gets by far the biggest compression boost, with size being reduced by up to 45% of Snappy size.
|
||||||
|
|
||||||
|
The "better" compression mode sees a good improvement in all cases, but usually at a performance cost.
|
||||||
|
|
||||||
|
Incompressible content (`sharnd.out.2gb`, 2GB random data) sees the smallest speedup.
|
||||||
|
This is likely dominated by synchronization overhead, which is confirmed by the fact that single threaded performance is higher (see above).
|
||||||
|
|
||||||
|
## Decompression
|
||||||
|
|
||||||
|
S2 attempts to create content that is also fast to decompress, except in "better" mode where the smallest representation is used.
|
||||||
|
|
||||||
|
S2 vs Snappy **decompression** speed. Both operating on single core:
|
||||||
|
|
||||||
|
| File | S2 Throughput | vs. Snappy | Better Throughput | vs. Snappy |
|
||||||
|
|-----------------------------------------------------------------------------------------------------|---------------|------------|-------------------|------------|
|
||||||
|
| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 2117 MB/s | 1.14x | 1738 MB/s | 0.94x |
|
||||||
|
| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 2401 MB/s | 1.25x | 2307 MB/s | 1.20x |
|
||||||
|
| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 2075 MB/s | 0.98x | 1764 MB/s | 0.83x |
|
||||||
|
| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 2967 MB/s | 1.05x | 2885 MB/s | 1.02x |
|
||||||
|
| [adresser.json](https://files.klauspost.com/compress/adresser.json.zst) | 4141 MB/s | 1.07x | 4184 MB/s | 1.08x |
|
||||||
|
| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 2264 MB/s | 1.12x | 2185 MB/s | 1.08x |
|
||||||
|
| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 1525 MB/s | 1.03x | 1347 MB/s | 0.91x |
|
||||||
|
| sharnd.out.2gb | 3813 MB/s | 0.79x | 3900 MB/s | 0.81x |
|
||||||
|
| [enwik9](http://mattmahoney.net/dc/textdata.html) | 1246 MB/s | 1.29x | 967 MB/s | 1.00x |
|
||||||
|
| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 1433 MB/s | 1.12x | 1203 MB/s | 0.94x |
|
||||||
|
| [enwik10](https://encode.su/threads/3315-enwik10-benchmark-results) | 1284 MB/s | 1.32x | 1010 MB/s | 1.04x |
|
||||||
|
|
||||||
|
### Legend
|
||||||
|
|
||||||
|
* `S2 Throughput`: Decompression speed of S2 encoded content.
|
||||||
|
* `Better Throughput`: Decompression speed of S2 "better" encoded content.
|
||||||
|
* `vs Snappy`: Decompression speed of S2 "better" mode compared to Snappy and absolute speed.
|
||||||
|
|
||||||
|
|
||||||
|
While the decompression code hasn't changed, there is a significant speedup in decompression speed.
|
||||||
|
S2 prefers longer matches and will typically only find matches that are 6 bytes or longer.
|
||||||
|
While this reduces compression a bit, it improves decompression speed.
|
||||||
|
|
||||||
|
The "better" compression mode will actively look for shorter matches, which is why it has a decompression speed quite similar to Snappy.
|
||||||
|
|
||||||
|
Without assembly decompression is also very fast; single goroutine decompression speed. No assembly:
|
||||||
|
|
||||||
|
| File | S2 Throughput | S2 throughput |
|
||||||
|
|--------------------------------|--------------|---------------|
|
||||||
|
| consensus.db.10gb.s2 | 1.84x | 2289.8 MB/s |
|
||||||
|
| 10gb.tar.s2 | 1.30x | 867.07 MB/s |
|
||||||
|
| rawstudio-mint14.tar.s2 | 1.66x | 1329.65 MB/s |
|
||||||
|
| github-june-2days-2019.json.s2 | 2.36x | 1831.59 MB/s |
|
||||||
|
| github-ranks-backup.bin.s2 | 1.73x | 1390.7 MB/s |
|
||||||
|
| enwik9.s2 | 1.67x | 681.53 MB/s |
|
||||||
|
| adresser.json.s2 | 3.41x | 4230.53 MB/s |
|
||||||
|
| silesia.tar.s2 | 1.52x | 811.58 MB/s |
|
||||||
|
|
||||||
|
Even though S2 typically compresses better than Snappy, decompression speed is always better.
|
||||||
|
|
||||||
|
### Concurrent Stream Decompression
|
||||||
|
|
||||||
|
For full stream decompression S2 offers a [DecodeConcurrent](https://pkg.go.dev/github.com/klauspost/compress/s2#Reader.DecodeConcurrent)
|
||||||
|
that will decode a full stream using multiple goroutines.
|
||||||
|
|
||||||
|
Example scaling, AMD Ryzen 3950X, 16 cores, decompression using `s2d -bench=3 <input>`, best of 3:
|
||||||
|
|
||||||
|
| Input | `-cpu=1` | `-cpu=2` | `-cpu=4` | `-cpu=8` | `-cpu=16` |
|
||||||
|
|-------------------------------------------|------------|------------|------------|------------|-------------|
|
||||||
|
| enwik10.snappy | 1098.6MB/s | 1819.8MB/s | 3625.6MB/s | 6910.6MB/s | 10818.2MB/s |
|
||||||
|
| enwik10.s2 | 1303.5MB/s | 2606.1MB/s | 4847.9MB/s | 8878.4MB/s | 9592.1MB/s |
|
||||||
|
| sofia-air-quality-dataset.tar.snappy | 1302.0MB/s | 2165.0MB/s | 4244.5MB/s | 8241.0MB/s | 12920.5MB/s |
|
||||||
|
| sofia-air-quality-dataset.tar.s2 | 1399.2MB/s | 2463.2MB/s | 5196.5MB/s | 9639.8MB/s | 11439.5MB/s |
|
||||||
|
| sofia-air-quality-dataset.tar.s2 (no asm) | 837.5MB/s | 1652.6MB/s | 3183.6MB/s | 5945.0MB/s | 9620.7MB/s |
|
||||||
|
|
||||||
|
Scaling can be expected to be pretty linear until memory bandwidth is saturated.
|
||||||
|
|
||||||
|
For now the DecodeConcurrent can only be used for full streams without seeking or combining with regular reads.
|
||||||
|
|
||||||
|
## Block compression
|
||||||
|
|
||||||
|
|
||||||
|
When compressing blocks no concurrent compression is performed just as Snappy.
|
||||||
|
This is because blocks are for smaller payloads and generally will not benefit from concurrent compression.
|
||||||
|
|
||||||
|
An important change is that incompressible blocks will not be more than at most 10 bytes bigger than the input.
|
||||||
|
In rare, worst case scenario Snappy blocks could be significantly bigger than the input.
|
||||||
|
|
||||||
|
### Mixed content blocks
|
||||||
|
|
||||||
|
The most reliable is a wide dataset.
|
||||||
|
For this we use [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),
|
||||||
|
53927 files, total input size: 4,014,735,833 bytes. Single goroutine used.
|
||||||
|
|
||||||
|
| * | Input | Output | Reduction | MB/s |
|
||||||
|
|-------------------|------------|------------|-----------|--------|
|
||||||
|
| S2 | 4014735833 | 1059723369 | 73.60% | **934.34** |
|
||||||
|
| S2 Better | 4014735833 | 969670507 | 75.85% | 532.70 |
|
||||||
|
| S2 Best | 4014735833 | 906625668 | **77.85%** | 46.84 |
|
||||||
|
| Snappy | 4014735833 | 1128706759 | 71.89% | 762.59 |
|
||||||
|
| S2, Snappy Output | 4014735833 | 1093821420 | 72.75% | 908.60 |
|
||||||
|
| LZ4 | 4014735833 | 1079259294 | 73.12% | 526.94 |
|
||||||
|
|
||||||
|
S2 delivers both the best single threaded throughput with regular mode and the best compression rate with "best".
|
||||||
|
"Better" mode provides the same compression speed as LZ4 with better compression ratio.
|
||||||
|
|
||||||
|
When outputting Snappy compatible output it still delivers better throughput (150MB/s more) and better compression.
|
||||||
|
|
||||||
|
As can be seen from the other benchmarks decompression should also be easier on the S2 generated output.
|
||||||
|
|
||||||
|
Though they cannot be compared due to different decompression speeds, here are the speed/size comparisons for
|
||||||
|
other Go compressors:
|
||||||
|
|
||||||
|
| * | Input | Output | Reduction | MB/s |
|
||||||
|
|-------------------|------------|------------|-----------|--------|
|
||||||
|
| Zstd Fastest (Go) | 4014735833 | 794608518 | 80.21% | 236.04 |
|
||||||
|
| Zstd Best (Go) | 4014735833 | 704603356 | 82.45% | 35.63 |
|
||||||
|
| Deflate (Go) l1 | 4014735833 | 871294239 | 78.30% | 214.04 |
|
||||||
|
| Deflate (Go) l9 | 4014735833 | 730389060 | 81.81% | 41.17 |
|
||||||
|
|
||||||
|
### Standard block compression
|
||||||
|
|
||||||
|
Benchmarking single block performance is subject to a lot more variation since it only tests a limited number of file patterns.
|
||||||
|
So individual benchmarks should only be seen as a guideline and the overall picture is more important.
|
||||||
|
|
||||||
|
These micro-benchmarks are with data in cache and trained branch predictors. For a more realistic benchmark see the mixed content above.
|
||||||
|
|
||||||
|
Block compression. Parallel benchmark running on 16 cores, 16 goroutines.
|
||||||
|
|
||||||
|
AMD64 assembly is used for both S2 and Snappy.
|
||||||
|
|
||||||
|
| Absolute Perf | Snappy size | S2 Size | Snappy Speed | S2 Speed | Snappy dec | S2 dec |
|
||||||
|
|-----------------------|-------------|---------|--------------|-------------|-------------|-------------|
|
||||||
|
| html | 22843 | 21111 | 16246 MB/s | 17438 MB/s | 40972 MB/s | 49263 MB/s |
|
||||||
|
| urls.10K | 335492 | 287326 | 7943 MB/s | 9693 MB/s | 22523 MB/s | 26484 MB/s |
|
||||||
|
| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 273889 MB/s | 718321 MB/s | 827552 MB/s |
|
||||||
|
| fireworks.jpeg (200B) | 146 | 155 | 8869 MB/s | 17773 MB/s | 33691 MB/s | 52421 MB/s |
|
||||||
|
| paper-100k.pdf | 85304 | 84459 | 167546 MB/s | 101263 MB/s | 326905 MB/s | 291944 MB/s |
|
||||||
|
| html_x_4 | 92234 | 21113 | 15194 MB/s | 50670 MB/s | 30843 MB/s | 32217 MB/s |
|
||||||
|
| alice29.txt | 88034 | 85975 | 5936 MB/s | 6139 MB/s | 12882 MB/s | 20044 MB/s |
|
||||||
|
| asyoulik.txt | 77503 | 79650 | 5517 MB/s | 6366 MB/s | 12735 MB/s | 22806 MB/s |
|
||||||
|
| lcet10.txt | 234661 | 220670 | 6235 MB/s | 6067 MB/s | 14519 MB/s | 18697 MB/s |
|
||||||
|
| plrabn12.txt | 319267 | 317985 | 5159 MB/s | 5726 MB/s | 11923 MB/s | 19901 MB/s |
|
||||||
|
| geo.protodata | 23335 | 18690 | 21220 MB/s | 26529 MB/s | 56271 MB/s | 62540 MB/s |
|
||||||
|
| kppkn.gtb | 69526 | 65312 | 9732 MB/s | 8559 MB/s | 18491 MB/s | 18969 MB/s |
|
||||||
|
| alice29.txt (128B) | 80 | 82 | 6691 MB/s | 15489 MB/s | 31883 MB/s | 38874 MB/s |
|
||||||
|
| alice29.txt (1000B) | 774 | 774 | 12204 MB/s | 13000 MB/s | 48056 MB/s | 52341 MB/s |
|
||||||
|
| alice29.txt (10000B) | 6648 | 6933 | 10044 MB/s | 12806 MB/s | 32378 MB/s | 46322 MB/s |
|
||||||
|
| alice29.txt (20000B) | 12686 | 13574 | 7733 MB/s | 11210 MB/s | 30566 MB/s | 58969 MB/s |
|
||||||
|
|
||||||
|
|
||||||
|
| Relative Perf | Snappy size | S2 size improved | S2 Speed | S2 Dec Speed |
|
||||||
|
|-----------------------|-------------|------------------|----------|--------------|
|
||||||
|
| html | 22.31% | 7.58% | 1.07x | 1.20x |
|
||||||
|
| urls.10K | 47.78% | 14.36% | 1.22x | 1.18x |
|
||||||
|
| fireworks.jpeg | 99.95% | -0.05% | 0.78x | 1.15x |
|
||||||
|
| fireworks.jpeg (200B) | 73.00% | -6.16% | 2.00x | 1.56x |
|
||||||
|
| paper-100k.pdf | 83.30% | 0.99% | 0.60x | 0.89x |
|
||||||
|
| html_x_4 | 22.52% | 77.11% | 3.33x | 1.04x |
|
||||||
|
| alice29.txt | 57.88% | 2.34% | 1.03x | 1.56x |
|
||||||
|
| asyoulik.txt | 61.91% | -2.77% | 1.15x | 1.79x |
|
||||||
|
| lcet10.txt | 54.99% | 5.96% | 0.97x | 1.29x |
|
||||||
|
| plrabn12.txt | 66.26% | 0.40% | 1.11x | 1.67x |
|
||||||
|
| geo.protodata | 19.68% | 19.91% | 1.25x | 1.11x |
|
||||||
|
| kppkn.gtb | 37.72% | 6.06% | 0.88x | 1.03x |
|
||||||
|
| alice29.txt (128B) | 62.50% | -2.50% | 2.31x | 1.22x |
|
||||||
|
| alice29.txt (1000B) | 77.40% | 0.00% | 1.07x | 1.09x |
|
||||||
|
| alice29.txt (10000B) | 66.48% | -4.29% | 1.27x | 1.43x |
|
||||||
|
| alice29.txt (20000B) | 63.43% | -7.00% | 1.45x | 1.93x |
|
||||||
|
|
||||||
|
Speed is generally at or above Snappy. Small blocks gets a significant speedup, although at the expense of size.
|
||||||
|
|
||||||
|
Decompression speed is better than Snappy, except in one case.
|
||||||
|
|
||||||
|
Since payloads are very small the variance in terms of size is rather big, so they should only be seen as a general guideline.
|
||||||
|
|
||||||
|
Size is on average around Snappy, but varies on content type.
|
||||||
|
In cases where compression is worse, it usually is compensated by a speed boost.
|
||||||
|
|
||||||
|
|
||||||
|
### Better compression
|
||||||
|
|
||||||
|
Benchmarking single block performance is subject to a lot more variation since it only tests a limited number of file patterns.
|
||||||
|
So individual benchmarks should only be seen as a guideline and the overall picture is more important.
|
||||||
|
|
||||||
|
| Absolute Perf | Snappy size | Better Size | Snappy Speed | Better Speed | Snappy dec | Better dec |
|
||||||
|
|-----------------------|-------------|-------------|--------------|--------------|-------------|-------------|
|
||||||
|
| html | 22843 | 19833 | 16246 MB/s | 7731 MB/s | 40972 MB/s | 40292 MB/s |
|
||||||
|
| urls.10K | 335492 | 253529 | 7943 MB/s | 3980 MB/s | 22523 MB/s | 20981 MB/s |
|
||||||
|
| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 9760 MB/s | 718321 MB/s | 823698 MB/s |
|
||||||
|
| fireworks.jpeg (200B) | 146 | 142 | 8869 MB/s | 594 MB/s | 33691 MB/s | 30101 MB/s |
|
||||||
|
| paper-100k.pdf | 85304 | 82915 | 167546 MB/s | 7470 MB/s | 326905 MB/s | 198869 MB/s |
|
||||||
|
| html_x_4 | 92234 | 19841 | 15194 MB/s | 23403 MB/s | 30843 MB/s | 30937 MB/s |
|
||||||
|
| alice29.txt | 88034 | 73218 | 5936 MB/s | 2945 MB/s | 12882 MB/s | 16611 MB/s |
|
||||||
|
| asyoulik.txt | 77503 | 66844 | 5517 MB/s | 2739 MB/s | 12735 MB/s | 14975 MB/s |
|
||||||
|
| lcet10.txt | 234661 | 190589 | 6235 MB/s | 3099 MB/s | 14519 MB/s | 16634 MB/s |
|
||||||
|
| plrabn12.txt | 319267 | 270828 | 5159 MB/s | 2600 MB/s | 11923 MB/s | 13382 MB/s |
|
||||||
|
| geo.protodata | 23335 | 18278 | 21220 MB/s | 11208 MB/s | 56271 MB/s | 57961 MB/s |
|
||||||
|
| kppkn.gtb | 69526 | 61851 | 9732 MB/s | 4556 MB/s | 18491 MB/s | 16524 MB/s |
|
||||||
|
| alice29.txt (128B) | 80 | 81 | 6691 MB/s | 529 MB/s | 31883 MB/s | 34225 MB/s |
|
||||||
|
| alice29.txt (1000B) | 774 | 748 | 12204 MB/s | 1943 MB/s | 48056 MB/s | 42068 MB/s |
|
||||||
|
| alice29.txt (10000B) | 6648 | 6234 | 10044 MB/s | 2949 MB/s | 32378 MB/s | 28813 MB/s |
|
||||||
|
| alice29.txt (20000B) | 12686 | 11584 | 7733 MB/s | 2822 MB/s | 30566 MB/s | 27315 MB/s |
|
||||||
|
|
||||||
|
|
||||||
|
| Relative Perf | Snappy size | Better size | Better Speed | Better dec |
|
||||||
|
|-----------------------|-------------|-------------|--------------|------------|
|
||||||
|
| html | 22.31% | 13.18% | 0.48x | 0.98x |
|
||||||
|
| urls.10K | 47.78% | 24.43% | 0.50x | 0.93x |
|
||||||
|
| fireworks.jpeg | 99.95% | -0.05% | 0.03x | 1.15x |
|
||||||
|
| fireworks.jpeg (200B) | 73.00% | 2.74% | 0.07x | 0.89x |
|
||||||
|
| paper-100k.pdf | 83.30% | 2.80% | 0.07x | 0.61x |
|
||||||
|
| html_x_4 | 22.52% | 78.49% | 0.04x | 1.00x |
|
||||||
|
| alice29.txt | 57.88% | 16.83% | 1.54x | 1.29x |
|
||||||
|
| asyoulik.txt | 61.91% | 13.75% | 0.50x | 1.18x |
|
||||||
|
| lcet10.txt | 54.99% | 18.78% | 0.50x | 1.15x |
|
||||||
|
| plrabn12.txt | 66.26% | 15.17% | 0.50x | 1.12x |
|
||||||
|
| geo.protodata | 19.68% | 21.67% | 0.50x | 1.03x |
|
||||||
|
| kppkn.gtb | 37.72% | 11.04% | 0.53x | 0.89x |
|
||||||
|
| alice29.txt (128B) | 62.50% | -1.25% | 0.47x | 1.07x |
|
||||||
|
| alice29.txt (1000B) | 77.40% | 3.36% | 0.08x | 0.88x |
|
||||||
|
| alice29.txt (10000B) | 66.48% | 6.23% | 0.16x | 0.89x |
|
||||||
|
| alice29.txt (20000B) | 63.43% | 8.69% | 0.29x | 0.89x |
|
||||||
|
|
||||||
|
Except for the mostly incompressible JPEG image compression is better and usually in the
|
||||||
|
double digits in terms of percentage reduction over Snappy.
|
||||||
|
|
||||||
|
The PDF sample shows a significant slowdown compared to Snappy, as this mode tries harder
|
||||||
|
to compress the data. Very small blocks are also not favorable for better compression, so throughput is way down.
|
||||||
|
|
||||||
|
This mode aims to provide better compression at the expense of performance and achieves that
|
||||||
|
without a huge performance penalty, except on very small blocks.
|
||||||
|
|
||||||
|
Decompression speed suffers a little compared to the regular S2 mode,
|
||||||
|
but still manages to be close to Snappy in spite of increased compression.
|
||||||
|
|
||||||
|
# Best compression mode
|
||||||
|
|
||||||
|
S2 offers a "best" compression mode.
|
||||||
|
|
||||||
|
This will compress as much as possible with little regard to CPU usage.
|
||||||
|
|
||||||
|
Mainly for offline compression, but where decompression speed should still
|
||||||
|
be high and compatible with other S2 compressed data.
|
||||||
|
|
||||||
|
Some examples compared on 16 core CPU, amd64 assembly used:
|
||||||
|
|
||||||
|
```
|
||||||
|
* enwik10
|
||||||
|
Default... 10000000000 -> 4761467548 [47.61%]; 1.098s, 8685.6MB/s
|
||||||
|
Better... 10000000000 -> 4219438251 [42.19%]; 1.925s, 4954.2MB/s
|
||||||
|
Best... 10000000000 -> 3627364337 [36.27%]; 43.051s, 221.5MB/s
|
||||||
|
|
||||||
|
* github-june-2days-2019.json
|
||||||
|
Default... 6273951764 -> 1043196283 [16.63%]; 431ms, 13882.3MB/s
|
||||||
|
Better... 6273951764 -> 949146808 [15.13%]; 547ms, 10938.4MB/s
|
||||||
|
Best... 6273951764 -> 832855506 [13.27%]; 9.455s, 632.8MB/s
|
||||||
|
|
||||||
|
* nyc-taxi-data-10M.csv
|
||||||
|
Default... 3325605752 -> 1095998837 [32.96%]; 324ms, 9788.7MB/s
|
||||||
|
Better... 3325605752 -> 954776589 [28.71%]; 491ms, 6459.4MB/s
|
||||||
|
Best... 3325605752 -> 779098746 [23.43%]; 8.29s, 382.6MB/s
|
||||||
|
|
||||||
|
* 10gb.tar
|
||||||
|
Default... 10065157632 -> 5916578242 [58.78%]; 1.028s, 9337.4MB/s
|
||||||
|
Better... 10065157632 -> 5649207485 [56.13%]; 1.597s, 6010.6MB/s
|
||||||
|
Best... 10065157632 -> 5208719802 [51.75%]; 32.78s, 292.8MB/s
|
||||||
|
|
||||||
|
* consensus.db.10gb
|
||||||
|
Default... 10737418240 -> 4562648848 [42.49%]; 882ms, 11610.0MB/s
|
||||||
|
Better... 10737418240 -> 4542428129 [42.30%]; 1.533s, 6679.7MB/s
|
||||||
|
Best... 10737418240 -> 4244773384 [39.53%]; 42.96s, 238.4MB/s
|
||||||
|
```
|
||||||
|
|
||||||
|
Decompression speed should be around the same as using the 'better' compression mode.
|
||||||
|
|
||||||
|
# Snappy Compatibility
|
||||||
|
|
||||||
|
S2 now offers full compatibility with Snappy.
|
||||||
|
|
||||||
|
This means that the efficient encoders of S2 can be used to generate fully Snappy compatible output.
|
||||||
|
|
||||||
|
There is a [snappy](https://github.com/klauspost/compress/tree/master/snappy) package that can be used by
|
||||||
|
simply changing imports from `github.com/golang/snappy` to `github.com/klauspost/compress/snappy`.
|
||||||
|
This uses "better" mode for all operations.
|
||||||
|
If you would like more control, you can use the s2 package as described below:
|
||||||
|
|
||||||
|
## Blocks
|
||||||
|
|
||||||
|
Snappy compatible blocks can be generated with the S2 encoder.
|
||||||
|
Compression and speed are typically a bit better. `MaxEncodedLen` is also smaller, for smaller memory usage. Replace
|
||||||
|
|
||||||
|
| Snappy | S2 replacement |
|
||||||
|
|----------------------------|-------------------------|
|
||||||
|
| snappy.Encode(...) | s2.EncodeSnappy(...) |
|
||||||
|
| snappy.MaxEncodedLen(...) | s2.MaxEncodedLen(...) |
|
||||||
|
|
||||||
|
`s2.EncodeSnappy` can be replaced with `s2.EncodeSnappyBetter` or `s2.EncodeSnappyBest` to get more efficiently compressed snappy compatible output.
|
||||||
|
|
||||||
|
`s2.ConcatBlocks` is compatible with snappy blocks.
|
||||||
|
|
||||||
|
Comparison of [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),
|
||||||
|
53927 files, total input size: 4,014,735,833 bytes. amd64, single goroutine used:
|
||||||
|
|
||||||
|
| Encoder | Size | MB/s | Reduction |
|
||||||
|
|-----------------------|------------|------------|------------|
|
||||||
|
| snappy.Encode | 1128706759 | 725.59 | 71.89% |
|
||||||
|
| s2.EncodeSnappy | 1093823291 | **899.16** | 72.75% |
|
||||||
|
| s2.EncodeSnappyBetter | 1001158548 | 578.49 | 75.06% |
|
||||||
|
| s2.EncodeSnappyBest | 944507998 | 66.00 | **76.47%**|
|
||||||
|
|
||||||
|
## Streams
|
||||||
|
|
||||||
|
For streams, replace `enc = snappy.NewBufferedWriter(w)` with `enc = s2.NewWriter(w, s2.WriterSnappyCompat())`.
|
||||||
|
All other options are available, but note that block size limit is different for snappy.
|
||||||
|
|
||||||
|
Comparison of different streams, AMD Ryzen 3950x, 16 cores. Size and throughput:
|
||||||
|
|
||||||
|
| File | snappy.NewWriter | S2 Snappy | S2 Snappy, Better | S2 Snappy, Best |
|
||||||
|
|-----------------------------|--------------------------|---------------------------|--------------------------|-------------------------|
|
||||||
|
| nyc-taxi-data-10M.csv | 1316042016 - 539.47MB/s | 1307003093 - 10132.73MB/s | 1174534014 - 5002.44MB/s | 1115904679 - 177.97MB/s |
|
||||||
|
| enwik10 (xml) | 5088294643 - 451.13MB/s | 5175840939 - 9440.69MB/s | 4560784526 - 4487.21MB/s | 4340299103 - 158.92MB/s |
|
||||||
|
| 10gb.tar (mixed) | 6056946612 - 729.73MB/s | 6208571995 - 9978.05MB/s | 5741646126 - 4919.98MB/s | 5548973895 - 180.44MB/s |
|
||||||
|
| github-june-2days-2019.json | 1525176492 - 933.00MB/s | 1476519054 - 13150.12MB/s | 1400547532 - 5803.40MB/s | 1321887137 - 204.29MB/s |
|
||||||
|
| consensus.db.10gb (db) | 5412897703 - 1102.14MB/s | 5354073487 - 13562.91MB/s | 5335069899 - 5294.73MB/s | 5201000954 - 175.72MB/s |
|
||||||
|
|
||||||
|
# Decompression
|
||||||
|
|
||||||
|
All decompression functions map directly to equivalent s2 functions.
|
||||||
|
|
||||||
|
| Snappy | S2 replacement |
|
||||||
|
|------------------------|--------------------|
|
||||||
|
| snappy.Decode(...) | s2.Decode(...) |
|
||||||
|
| snappy.DecodedLen(...) | s2.DecodedLen(...) |
|
||||||
|
| snappy.NewReader(...) | s2.NewReader(...) |
|
||||||
|
|
||||||
|
Features like [quick forward skipping without decompression](https://pkg.go.dev/github.com/klauspost/compress/s2#Reader.Skip)
|
||||||
|
are also available for Snappy streams.
|
||||||
|
|
||||||
|
If you know you are only decompressing snappy streams, setting [`ReaderMaxBlockSize(64<<10)`](https://pkg.go.dev/github.com/klauspost/compress/s2#ReaderMaxBlockSize)
|
||||||
|
on your Reader will reduce memory consumption.
|
||||||
|
|
||||||
|
# Concatenating blocks and streams.
|
||||||
|
|
||||||
|
Concatenating streams will concatenate the output of both without recompressing them.
|
||||||
|
While this is inefficient in terms of compression it might be usable in certain scenarios.
|
||||||
|
The 10 byte 'stream identifier' of the second stream can optionally be stripped, but it is not a requirement.
|
||||||
|
|
||||||
|
Blocks can be concatenated using the `ConcatBlocks` function.
|
||||||
|
|
||||||
|
Snappy blocks/streams can safely be concatenated with S2 blocks and streams.
|
||||||
|
Streams with indexes (see below) will currently not work on concatenated streams.
|
||||||
|
|
||||||
|
# Stream Seek Index
|
||||||
|
|
||||||
|
S2 and Snappy streams can have indexes. These indexes will allow random seeking within the compressed data.
|
||||||
|
|
||||||
|
The index can either be appended to the stream as a skippable block or returned for separate storage.
|
||||||
|
|
||||||
|
When the index is appended to a stream it will be skipped by regular decoders,
|
||||||
|
so the output remains compatible with other decoders.
|
||||||
|
|
||||||
|
## Creating an Index
|
||||||
|
|
||||||
|
To automatically add an index to a stream, add `WriterAddIndex()` option to your writer.
|
||||||
|
Then the index will be added to the stream when `Close()` is called.
|
||||||
|
|
||||||
|
```
|
||||||
|
// Add Index to stream...
|
||||||
|
enc := s2.NewWriter(w, s2.WriterAddIndex())
|
||||||
|
io.Copy(enc, r)
|
||||||
|
enc.Close()
|
||||||
|
```
|
||||||
|
|
||||||
|
If you want to store the index separately, you can use `CloseIndex()` instead of the regular `Close()`.
|
||||||
|
This will return the index. Note that `CloseIndex()` should only be called once, and you shouldn't call `Close()`.
|
||||||
|
|
||||||
|
```
|
||||||
|
// Get index for separate storage...
|
||||||
|
enc := s2.NewWriter(w)
|
||||||
|
io.Copy(enc, r)
|
||||||
|
index, err := enc.CloseIndex()
|
||||||
|
```
|
||||||
|
|
||||||
|
The `index` can then be used when reading from the stream.
|
||||||
|
This means the index can be used without needing to seek to the end of the stream
|
||||||
|
or for manually forwarding streams. See below.
|
||||||
|
|
||||||
|
Finally, an existing S2/Snappy stream can be indexed using the `s2.IndexStream(r io.Reader)` function.
|
||||||
|
|
||||||
|
## Using Indexes
|
||||||
|
|
||||||
|
To use indexes there is a `ReadSeeker(random bool, index []byte) (*ReadSeeker, error)` function available.
|
||||||
|
|
||||||
|
Calling ReadSeeker will return an [io.ReadSeeker](https://pkg.go.dev/io#ReadSeeker) compatible version of the reader.
|
||||||
|
|
||||||
|
If 'random' is specified the returned io.Seeker can be used for random seeking, otherwise only forward seeking is supported.
|
||||||
|
Enabling random seeking requires the original input to support the [io.Seeker](https://pkg.go.dev/io#Seeker) interface.
|
||||||
|
|
||||||
|
```
|
||||||
|
dec := s2.NewReader(r)
|
||||||
|
rs, err := dec.ReadSeeker(false, nil)
|
||||||
|
rs.Seek(wantOffset, io.SeekStart)
|
||||||
|
```
|
||||||
|
|
||||||
|
Get a seeker to seek forward. Since no index is provided, the index is read from the stream.
|
||||||
|
This requires that an index was added and that `r` supports the [io.Seeker](https://pkg.go.dev/io#Seeker) interface.
|
||||||
|
|
||||||
|
A custom index can be specified which will be used if supplied.
|
||||||
|
When using a custom index, it will not be read from the input stream.
|
||||||
|
|
||||||
|
```
|
||||||
|
dec := s2.NewReader(r)
|
||||||
|
rs, err := dec.ReadSeeker(false, index)
|
||||||
|
rs.Seek(wantOffset, io.SeekStart)
|
||||||
|
```
|
||||||
|
|
||||||
|
This will read the index from `index`. Since we specify non-random (forward only) seeking, `r` does not have to be an io.Seeker.
|
||||||
|
|
||||||
|
```
|
||||||
|
dec := s2.NewReader(r)
|
||||||
|
rs, err := dec.ReadSeeker(true, index)
|
||||||
|
rs.Seek(wantOffset, io.SeekStart)
|
||||||
|
```
|
||||||
|
|
||||||
|
Finally, since we specify that we want to do random seeking, `r` must be an io.Seeker.
|
||||||
|
|
||||||
|
The returned [ReadSeeker](https://pkg.go.dev/github.com/klauspost/compress/s2#ReadSeeker) contains a shallow reference to the existing Reader,
|
||||||
|
meaning changes performed to one is reflected in the other.
|
||||||
|
|
||||||
|
To check if a stream contains an index at the end, the `(*Index).LoadStream(rs io.ReadSeeker) error` can be used.
|
||||||
|
|
||||||
|
## Manually Forwarding Streams
|
||||||
|
|
||||||
|
Indexes can also be read outside the decoder using the [Index](https://pkg.go.dev/github.com/klauspost/compress/s2#Index) type.
|
||||||
|
This can be used for parsing indexes, either separate or in streams.
|
||||||
|
|
||||||
|
In some cases it may not be possible to serve a seekable stream.
|
||||||
|
This can for instance be an HTTP stream, where the Range request
|
||||||
|
is sent at the start of the stream.
|
||||||
|
|
||||||
|
With a little bit of extra code it is still possible to use indexes
|
||||||
|
to forward to specific offset with a single forward skip.
|
||||||
|
|
||||||
|
It is possible to load the index manually like this:
|
||||||
|
```
|
||||||
|
var index s2.Index
|
||||||
|
_, err = index.Load(idxBytes)
|
||||||
|
```
|
||||||
|
|
||||||
|
This can be used to figure out how much to offset the compressed stream:
|
||||||
|
|
||||||
|
```
|
||||||
|
compressedOffset, uncompressedOffset, err := index.Find(wantOffset)
|
||||||
|
```
|
||||||
|
|
||||||
|
The `compressedOffset` is the number of bytes that should be skipped
|
||||||
|
from the beginning of the compressed file.
|
||||||
|
|
||||||
|
The `uncompressedOffset` will then be offset of the uncompressed bytes returned
|
||||||
|
when decoding from that position. This will always be <= wantOffset.
|
||||||
|
|
||||||
|
When creating a decoder it must be specified that it should *not* expect a stream identifier
|
||||||
|
at the beginning of the stream. Assuming the io.Reader `r` has been forwarded to `compressedOffset`
|
||||||
|
we create the decoder like this:
|
||||||
|
|
||||||
|
```
|
||||||
|
dec := s2.NewReader(r, s2.ReaderIgnoreStreamIdentifier())
|
||||||
|
```
|
||||||
|
|
||||||
|
We are not completely done. We still need to forward the stream the uncompressed bytes we didn't want.
|
||||||
|
This is done using the regular "Skip" function:
|
||||||
|
|
||||||
|
```
|
||||||
|
err = dec.Skip(wantOffset - uncompressedOffset)
|
||||||
|
```
|
||||||
|
|
||||||
|
This will ensure that we are at exactly the offset we want, and reading from `dec` will start at the requested offset.
|
||||||
|
|
||||||
|
## Index Format:
|
||||||
|
|
||||||
|
Each block is structured as a snappy skippable block, with the chunk ID 0x99.
|
||||||
|
|
||||||
|
The block can be read from the front, but contains information so it can be read from the back as well.
|
||||||
|
|
||||||
|
Numbers are stored as fixed size little endian values or [zigzag encoded](https://developers.google.com/protocol-buffers/docs/encoding#signed_integers) [base 128 varints](https://developers.google.com/protocol-buffers/docs/encoding),
|
||||||
|
with un-encoded value length of 64 bits, unless other limits are specified.
|
||||||
|
|
||||||
|
| Content | Format |
|
||||||
|
|---------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------|
|
||||||
|
| ID, `[1]byte` | Always 0x99. |
|
||||||
|
| Data Length, `[3]byte` | 3 byte little-endian length of the chunk in bytes, following this. |
|
||||||
|
| Header `[6]byte` | Header, must be `[115, 50, 105, 100, 120, 0]` or in text: "s2idx\x00". |
|
||||||
|
| UncompressedSize, Varint | Total Uncompressed size. |
|
||||||
|
| CompressedSize, Varint | Total Compressed size if known. Should be -1 if unknown. |
|
||||||
|
| EstBlockSize, Varint | Block Size, used for guessing uncompressed offsets. Must be >= 0. |
|
||||||
|
| Entries, Varint | Number of Entries in index, must be < 65536 and >=0. |
|
||||||
|
| HasUncompressedOffsets `byte` | 0 if no uncompressed offsets are present, 1 if present. Other values are invalid. |
|
||||||
|
| UncompressedOffsets, [Entries]VarInt | Uncompressed offsets. See below how to decode. |
|
||||||
|
| CompressedOffsets, [Entries]VarInt | Compressed offsets. See below how to decode. |
|
||||||
|
| Block Size, `[4]byte` | Little Endian total encoded size (including header and trailer). Can be used for searching backwards to start of block. |
|
||||||
|
| Trailer `[6]byte` | Trailer, must be `[0, 120, 100, 105, 50, 115]` or in text: "\x00xdi2s". Can be used for identifying block from end of stream. |
|
||||||
|
|
||||||
|
For regular streams the uncompressed offsets are fully predictable,
|
||||||
|
so `HasUncompressedOffsets` allows to specify that compressed blocks all have
|
||||||
|
exactly `EstBlockSize` bytes of uncompressed content.
|
||||||
|
|
||||||
|
Entries *must* be in order, starting with the lowest offset,
|
||||||
|
and there *must* be no uncompressed offset duplicates.
|
||||||
|
Entries *may* point to the start of a skippable block,
|
||||||
|
but it is then not allowed to also have an entry for the next block since
|
||||||
|
that would give an uncompressed offset duplicate.
|
||||||
|
|
||||||
|
There is no requirement for all blocks to be represented in the index.
|
||||||
|
In fact there is a maximum of 65536 block entries in an index.
|
||||||
|
|
||||||
|
The writer can use any method to reduce the number of entries.
|
||||||
|
An implicit block start at 0,0 can be assumed.
|
||||||
|
|
||||||
|
### Decoding entries:
|
||||||
|
|
||||||
|
```
|
||||||
|
// Read Uncompressed entries.
|
||||||
|
// Each assumes EstBlockSize delta from previous.
|
||||||
|
for each entry {
|
||||||
|
uOff = 0
|
||||||
|
if HasUncompressedOffsets == 1 {
|
||||||
|
uOff = ReadVarInt // Read value from stream
|
||||||
|
}
|
||||||
|
|
||||||
|
// Except for the first entry, use previous values.
|
||||||
|
if entryNum == 0 {
|
||||||
|
entry[entryNum].UncompressedOffset = uOff
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Uncompressed uses previous offset and adds EstBlockSize
|
||||||
|
entry[entryNum].UncompressedOffset = entry[entryNum-1].UncompressedOffset + EstBlockSize + uOff
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Guess that the first block will be 50% of uncompressed size.
|
||||||
|
// Integer truncating division must be used.
|
||||||
|
CompressGuess := EstBlockSize / 2
|
||||||
|
|
||||||
|
// Read Compressed entries.
|
||||||
|
// Each assumes CompressGuess delta from previous.
|
||||||
|
// CompressGuess is adjusted for each value.
|
||||||
|
for each entry {
|
||||||
|
cOff = ReadVarInt // Read value from stream
|
||||||
|
|
||||||
|
// Except for the first entry, use previous values.
|
||||||
|
if entryNum == 0 {
|
||||||
|
entry[entryNum].CompressedOffset = cOff
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compressed uses previous and our estimate.
|
||||||
|
entry[entryNum].CompressedOffset = entry[entryNum-1].CompressedOffset + CompressGuess + cOff
|
||||||
|
|
||||||
|
// Adjust compressed offset for next loop, integer truncating division must be used.
|
||||||
|
CompressGuess += cOff/2
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
To decode from any given uncompressed offset `(wantOffset)`:
|
||||||
|
|
||||||
|
* Iterate entries until `entry[n].UncompressedOffset > wantOffset`.
|
||||||
|
* Start decoding from `entry[n-1].CompressedOffset`.
|
||||||
|
* Discard `entry[n-1].UncompressedOffset - wantOffset` bytes from the decoded stream.
|
||||||
|
|
||||||
|
See [using indexes](https://github.com/klauspost/compress/tree/master/s2#using-indexes) for functions that perform the operations with a simpler interface.
|
||||||
|
|
||||||
|
# Format Extensions
|
||||||
|
|
||||||
|
* Frame [Stream identifier](https://github.com/google/snappy/blob/master/framing_format.txt#L68) changed from `sNaPpY` to `S2sTwO`.
|
||||||
|
* [Framed compressed blocks](https://github.com/google/snappy/blob/master/format_description.txt) can be up to 4MB (up from 64KB).
|
||||||
|
* Compressed blocks can have an offset of `0`, which indicates to repeat the last seen offset.
|
||||||
|
|
||||||
|
Repeat offsets must be encoded as a [2.2.1. Copy with 1-byte offset (01)](https://github.com/google/snappy/blob/master/format_description.txt#L89), where the offset is 0.
|
||||||
|
|
||||||
|
The length is specified by reading the 3-bit length specified in the tag and decode using this table:
|
||||||
|
|
||||||
|
| Length | Actual Length |
|
||||||
|
|--------|----------------------|
|
||||||
|
| 0 | 4 |
|
||||||
|
| 1 | 5 |
|
||||||
|
| 2 | 6 |
|
||||||
|
| 3 | 7 |
|
||||||
|
| 4 | 8 |
|
||||||
|
| 5 | 8 + read 1 byte |
|
||||||
|
| 6 | 260 + read 2 bytes |
|
||||||
|
| 7 | 65540 + read 3 bytes |
|
||||||
|
|
||||||
|
This allows any repeat offset + length to be represented by 2 to 5 bytes.
|
||||||
|
|
||||||
|
Lengths are stored as little endian values.
|
||||||
|
|
||||||
|
The first copy of a block cannot be a repeat offset and the offset is not carried across blocks in streams.
|
||||||
|
|
||||||
|
Default streaming block size is 1MB.
|
||||||
|
|
||||||
|
# LICENSE
|
||||||
|
|
||||||
|
This code is based on the [Snappy-Go](https://github.com/golang/snappy) implementation.
|
||||||
|
|
||||||
|
Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
|
1046
vendor/github.com/klauspost/compress/s2/decode.go
generated
vendored
Normal file
1046
vendor/github.com/klauspost/compress/s2/decode.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load diff
568
vendor/github.com/klauspost/compress/s2/decode_amd64.s
generated
vendored
Normal file
568
vendor/github.com/klauspost/compress/s2/decode_amd64.s
generated
vendored
Normal file
|
@ -0,0 +1,568 @@
|
||||||
|
// Copyright 2016 The Go Authors. All rights reserved.
|
||||||
|
// Copyright (c) 2019 Klaus Post. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
// +build !appengine
|
||||||
|
// +build gc
|
||||||
|
// +build !noasm
|
||||||
|
|
||||||
|
#include "textflag.h"
|
||||||
|
|
||||||
|
#define R_TMP0 AX
|
||||||
|
#define R_TMP1 BX
|
||||||
|
#define R_LEN CX
|
||||||
|
#define R_OFF DX
|
||||||
|
#define R_SRC SI
|
||||||
|
#define R_DST DI
|
||||||
|
#define R_DBASE R8
|
||||||
|
#define R_DLEN R9
|
||||||
|
#define R_DEND R10
|
||||||
|
#define R_SBASE R11
|
||||||
|
#define R_SLEN R12
|
||||||
|
#define R_SEND R13
|
||||||
|
#define R_TMP2 R14
|
||||||
|
#define R_TMP3 R15
|
||||||
|
|
||||||
|
// The asm code generally follows the pure Go code in decode_other.go, except
|
||||||
|
// where marked with a "!!!".
|
||||||
|
|
||||||
|
// func decode(dst, src []byte) int
|
||||||
|
//
|
||||||
|
// All local variables fit into registers. The non-zero stack size is only to
|
||||||
|
// spill registers and push args when issuing a CALL. The register allocation:
|
||||||
|
// - R_TMP0 scratch
|
||||||
|
// - R_TMP1 scratch
|
||||||
|
// - R_LEN length or x (shared)
|
||||||
|
// - R_OFF offset
|
||||||
|
// - R_SRC &src[s]
|
||||||
|
// - R_DST &dst[d]
|
||||||
|
// + R_DBASE dst_base
|
||||||
|
// + R_DLEN dst_len
|
||||||
|
// + R_DEND dst_base + dst_len
|
||||||
|
// + R_SBASE src_base
|
||||||
|
// + R_SLEN src_len
|
||||||
|
// + R_SEND src_base + src_len
|
||||||
|
// - R_TMP2 used by doCopy
|
||||||
|
// - R_TMP3 used by doCopy
|
||||||
|
//
|
||||||
|
// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
|
||||||
|
// function, and after a CALL returns, and are not otherwise modified.
|
||||||
|
//
|
||||||
|
// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
|
||||||
|
// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
|
||||||
|
TEXT ·s2Decode(SB), NOSPLIT, $48-56
|
||||||
|
// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
|
||||||
|
MOVQ dst_base+0(FP), R_DBASE
|
||||||
|
MOVQ dst_len+8(FP), R_DLEN
|
||||||
|
MOVQ R_DBASE, R_DST
|
||||||
|
MOVQ R_DBASE, R_DEND
|
||||||
|
ADDQ R_DLEN, R_DEND
|
||||||
|
MOVQ src_base+24(FP), R_SBASE
|
||||||
|
MOVQ src_len+32(FP), R_SLEN
|
||||||
|
MOVQ R_SBASE, R_SRC
|
||||||
|
MOVQ R_SBASE, R_SEND
|
||||||
|
ADDQ R_SLEN, R_SEND
|
||||||
|
XORQ R_OFF, R_OFF
|
||||||
|
|
||||||
|
loop:
|
||||||
|
// for s < len(src)
|
||||||
|
CMPQ R_SRC, R_SEND
|
||||||
|
JEQ end
|
||||||
|
|
||||||
|
// R_LEN = uint32(src[s])
|
||||||
|
//
|
||||||
|
// switch src[s] & 0x03
|
||||||
|
MOVBLZX (R_SRC), R_LEN
|
||||||
|
MOVL R_LEN, R_TMP1
|
||||||
|
ANDL $3, R_TMP1
|
||||||
|
CMPL R_TMP1, $1
|
||||||
|
JAE tagCopy
|
||||||
|
|
||||||
|
// ----------------------------------------
|
||||||
|
// The code below handles literal tags.
|
||||||
|
|
||||||
|
// case tagLiteral:
|
||||||
|
// x := uint32(src[s] >> 2)
|
||||||
|
// switch
|
||||||
|
SHRL $2, R_LEN
|
||||||
|
CMPL R_LEN, $60
|
||||||
|
JAE tagLit60Plus
|
||||||
|
|
||||||
|
// case x < 60:
|
||||||
|
// s++
|
||||||
|
INCQ R_SRC
|
||||||
|
|
||||||
|
doLit:
|
||||||
|
// This is the end of the inner "switch", when we have a literal tag.
|
||||||
|
//
|
||||||
|
// We assume that R_LEN == x and x fits in a uint32, where x is the variable
|
||||||
|
// used in the pure Go decode_other.go code.
|
||||||
|
|
||||||
|
// length = int(x) + 1
|
||||||
|
//
|
||||||
|
// Unlike the pure Go code, we don't need to check if length <= 0 because
|
||||||
|
// R_LEN can hold 64 bits, so the increment cannot overflow.
|
||||||
|
INCQ R_LEN
|
||||||
|
|
||||||
|
// Prepare to check if copying length bytes will run past the end of dst or
|
||||||
|
// src.
|
||||||
|
//
|
||||||
|
// R_TMP0 = len(dst) - d
|
||||||
|
// R_TMP1 = len(src) - s
|
||||||
|
MOVQ R_DEND, R_TMP0
|
||||||
|
SUBQ R_DST, R_TMP0
|
||||||
|
MOVQ R_SEND, R_TMP1
|
||||||
|
SUBQ R_SRC, R_TMP1
|
||||||
|
|
||||||
|
// !!! Try a faster technique for short (16 or fewer bytes) copies.
|
||||||
|
//
|
||||||
|
// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
|
||||||
|
// goto callMemmove // Fall back on calling runtime·memmove.
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
|
||||||
|
// against 21 instead of 16, because it cannot assume that all of its input
|
||||||
|
// is contiguous in memory and so it needs to leave enough source bytes to
|
||||||
|
// read the next tag without refilling buffers, but Go's Decode assumes
|
||||||
|
// contiguousness (the src argument is a []byte).
|
||||||
|
CMPQ R_LEN, $16
|
||||||
|
JGT callMemmove
|
||||||
|
CMPQ R_TMP0, $16
|
||||||
|
JLT callMemmove
|
||||||
|
CMPQ R_TMP1, $16
|
||||||
|
JLT callMemmove
|
||||||
|
|
||||||
|
// !!! Implement the copy from src to dst as a 16-byte load and store.
|
||||||
|
// (Decode's documentation says that dst and src must not overlap.)
|
||||||
|
//
|
||||||
|
// This always copies 16 bytes, instead of only length bytes, but that's
|
||||||
|
// OK. If the input is a valid Snappy encoding then subsequent iterations
|
||||||
|
// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
|
||||||
|
// non-nil error), so the overrun will be ignored.
|
||||||
|
//
|
||||||
|
// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
|
||||||
|
// 16-byte loads and stores. This technique probably wouldn't be as
|
||||||
|
// effective on architectures that are fussier about alignment.
|
||||||
|
MOVOU 0(R_SRC), X0
|
||||||
|
MOVOU X0, 0(R_DST)
|
||||||
|
|
||||||
|
// d += length
|
||||||
|
// s += length
|
||||||
|
ADDQ R_LEN, R_DST
|
||||||
|
ADDQ R_LEN, R_SRC
|
||||||
|
JMP loop
|
||||||
|
|
||||||
|
callMemmove:
|
||||||
|
// if length > len(dst)-d || length > len(src)-s { etc }
|
||||||
|
CMPQ R_LEN, R_TMP0
|
||||||
|
JGT errCorrupt
|
||||||
|
CMPQ R_LEN, R_TMP1
|
||||||
|
JGT errCorrupt
|
||||||
|
|
||||||
|
// copy(dst[d:], src[s:s+length])
|
||||||
|
//
|
||||||
|
// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
|
||||||
|
// R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
|
||||||
|
// three registers to the stack, to save local variables across the CALL.
|
||||||
|
MOVQ R_DST, 0(SP)
|
||||||
|
MOVQ R_SRC, 8(SP)
|
||||||
|
MOVQ R_LEN, 16(SP)
|
||||||
|
MOVQ R_DST, 24(SP)
|
||||||
|
MOVQ R_SRC, 32(SP)
|
||||||
|
MOVQ R_LEN, 40(SP)
|
||||||
|
MOVQ R_OFF, 48(SP)
|
||||||
|
CALL runtime·memmove(SB)
|
||||||
|
|
||||||
|
// Restore local variables: unspill registers from the stack and
|
||||||
|
// re-calculate R_DBASE-R_SEND.
|
||||||
|
MOVQ 24(SP), R_DST
|
||||||
|
MOVQ 32(SP), R_SRC
|
||||||
|
MOVQ 40(SP), R_LEN
|
||||||
|
MOVQ 48(SP), R_OFF
|
||||||
|
MOVQ dst_base+0(FP), R_DBASE
|
||||||
|
MOVQ dst_len+8(FP), R_DLEN
|
||||||
|
MOVQ R_DBASE, R_DEND
|
||||||
|
ADDQ R_DLEN, R_DEND
|
||||||
|
MOVQ src_base+24(FP), R_SBASE
|
||||||
|
MOVQ src_len+32(FP), R_SLEN
|
||||||
|
MOVQ R_SBASE, R_SEND
|
||||||
|
ADDQ R_SLEN, R_SEND
|
||||||
|
|
||||||
|
// d += length
|
||||||
|
// s += length
|
||||||
|
ADDQ R_LEN, R_DST
|
||||||
|
ADDQ R_LEN, R_SRC
|
||||||
|
JMP loop
|
||||||
|
|
||||||
|
tagLit60Plus:
|
||||||
|
// !!! This fragment does the
|
||||||
|
//
|
||||||
|
// s += x - 58; if uint(s) > uint(len(src)) { etc }
|
||||||
|
//
|
||||||
|
// checks. In the asm version, we code it once instead of once per switch case.
|
||||||
|
ADDQ R_LEN, R_SRC
|
||||||
|
SUBQ $58, R_SRC
|
||||||
|
CMPQ R_SRC, R_SEND
|
||||||
|
JA errCorrupt
|
||||||
|
|
||||||
|
// case x == 60:
|
||||||
|
CMPL R_LEN, $61
|
||||||
|
JEQ tagLit61
|
||||||
|
JA tagLit62Plus
|
||||||
|
|
||||||
|
// x = uint32(src[s-1])
|
||||||
|
MOVBLZX -1(R_SRC), R_LEN
|
||||||
|
JMP doLit
|
||||||
|
|
||||||
|
tagLit61:
|
||||||
|
// case x == 61:
|
||||||
|
// x = uint32(src[s-2]) | uint32(src[s-1])<<8
|
||||||
|
MOVWLZX -2(R_SRC), R_LEN
|
||||||
|
JMP doLit
|
||||||
|
|
||||||
|
tagLit62Plus:
|
||||||
|
CMPL R_LEN, $62
|
||||||
|
JA tagLit63
|
||||||
|
|
||||||
|
// case x == 62:
|
||||||
|
// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
|
||||||
|
// We read one byte, safe to read one back, since we are just reading tag.
|
||||||
|
// x = binary.LittleEndian.Uint32(src[s-1:]) >> 8
|
||||||
|
MOVL -4(R_SRC), R_LEN
|
||||||
|
SHRL $8, R_LEN
|
||||||
|
JMP doLit
|
||||||
|
|
||||||
|
tagLit63:
|
||||||
|
// case x == 63:
|
||||||
|
// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
|
||||||
|
MOVL -4(R_SRC), R_LEN
|
||||||
|
JMP doLit
|
||||||
|
|
||||||
|
// The code above handles literal tags.
|
||||||
|
// ----------------------------------------
|
||||||
|
// The code below handles copy tags.
|
||||||
|
|
||||||
|
tagCopy4:
|
||||||
|
// case tagCopy4:
|
||||||
|
// s += 5
|
||||||
|
ADDQ $5, R_SRC
|
||||||
|
|
||||||
|
// if uint(s) > uint(len(src)) { etc }
|
||||||
|
CMPQ R_SRC, R_SEND
|
||||||
|
JA errCorrupt
|
||||||
|
|
||||||
|
// length = 1 + int(src[s-5])>>2
|
||||||
|
SHRQ $2, R_LEN
|
||||||
|
INCQ R_LEN
|
||||||
|
|
||||||
|
// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
|
||||||
|
MOVLQZX -4(R_SRC), R_OFF
|
||||||
|
JMP doCopy
|
||||||
|
|
||||||
|
tagCopy2:
|
||||||
|
// case tagCopy2:
|
||||||
|
// s += 3
|
||||||
|
ADDQ $3, R_SRC
|
||||||
|
|
||||||
|
// if uint(s) > uint(len(src)) { etc }
|
||||||
|
CMPQ R_SRC, R_SEND
|
||||||
|
JA errCorrupt
|
||||||
|
|
||||||
|
// length = 1 + int(src[s-3])>>2
|
||||||
|
SHRQ $2, R_LEN
|
||||||
|
INCQ R_LEN
|
||||||
|
|
||||||
|
// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
|
||||||
|
MOVWQZX -2(R_SRC), R_OFF
|
||||||
|
JMP doCopy
|
||||||
|
|
||||||
|
tagCopy:
|
||||||
|
// We have a copy tag. We assume that:
|
||||||
|
// - R_TMP1 == src[s] & 0x03
|
||||||
|
// - R_LEN == src[s]
|
||||||
|
CMPQ R_TMP1, $2
|
||||||
|
JEQ tagCopy2
|
||||||
|
JA tagCopy4
|
||||||
|
|
||||||
|
// case tagCopy1:
|
||||||
|
// s += 2
|
||||||
|
ADDQ $2, R_SRC
|
||||||
|
|
||||||
|
// if uint(s) > uint(len(src)) { etc }
|
||||||
|
CMPQ R_SRC, R_SEND
|
||||||
|
JA errCorrupt
|
||||||
|
|
||||||
|
// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
|
||||||
|
// length = 4 + int(src[s-2])>>2&0x7
|
||||||
|
MOVBQZX -1(R_SRC), R_TMP1
|
||||||
|
MOVQ R_LEN, R_TMP0
|
||||||
|
SHRQ $2, R_LEN
|
||||||
|
ANDQ $0xe0, R_TMP0
|
||||||
|
ANDQ $7, R_LEN
|
||||||
|
SHLQ $3, R_TMP0
|
||||||
|
ADDQ $4, R_LEN
|
||||||
|
ORQ R_TMP1, R_TMP0
|
||||||
|
|
||||||
|
// check if repeat code, ZF set by ORQ.
|
||||||
|
JZ repeatCode
|
||||||
|
|
||||||
|
// This is a regular copy, transfer our temporary value to R_OFF (length)
|
||||||
|
MOVQ R_TMP0, R_OFF
|
||||||
|
JMP doCopy
|
||||||
|
|
||||||
|
// This is a repeat code.
|
||||||
|
repeatCode:
|
||||||
|
// If length < 9, reuse last offset, with the length already calculated.
|
||||||
|
CMPQ R_LEN, $9
|
||||||
|
JL doCopyRepeat
|
||||||
|
|
||||||
|
// Read additional bytes for length.
|
||||||
|
JE repeatLen1
|
||||||
|
|
||||||
|
// Rare, so the extra branch shouldn't hurt too much.
|
||||||
|
CMPQ R_LEN, $10
|
||||||
|
JE repeatLen2
|
||||||
|
JMP repeatLen3
|
||||||
|
|
||||||
|
// Read repeat lengths.
|
||||||
|
repeatLen1:
|
||||||
|
// s ++
|
||||||
|
ADDQ $1, R_SRC
|
||||||
|
|
||||||
|
// if uint(s) > uint(len(src)) { etc }
|
||||||
|
CMPQ R_SRC, R_SEND
|
||||||
|
JA errCorrupt
|
||||||
|
|
||||||
|
// length = src[s-1] + 8
|
||||||
|
MOVBQZX -1(R_SRC), R_LEN
|
||||||
|
ADDL $8, R_LEN
|
||||||
|
JMP doCopyRepeat
|
||||||
|
|
||||||
|
repeatLen2:
|
||||||
|
// s +=2
|
||||||
|
ADDQ $2, R_SRC
|
||||||
|
|
||||||
|
// if uint(s) > uint(len(src)) { etc }
|
||||||
|
CMPQ R_SRC, R_SEND
|
||||||
|
JA errCorrupt
|
||||||
|
|
||||||
|
// length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + (1 << 8)
|
||||||
|
MOVWQZX -2(R_SRC), R_LEN
|
||||||
|
ADDL $260, R_LEN
|
||||||
|
JMP doCopyRepeat
|
||||||
|
|
||||||
|
repeatLen3:
|
||||||
|
// s +=3
|
||||||
|
ADDQ $3, R_SRC
|
||||||
|
|
||||||
|
// if uint(s) > uint(len(src)) { etc }
|
||||||
|
CMPQ R_SRC, R_SEND
|
||||||
|
JA errCorrupt
|
||||||
|
|
||||||
|
// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + (1 << 16)
|
||||||
|
// Read one byte further back (just part of the tag, shifted out)
|
||||||
|
MOVL -4(R_SRC), R_LEN
|
||||||
|
SHRL $8, R_LEN
|
||||||
|
ADDL $65540, R_LEN
|
||||||
|
JMP doCopyRepeat
|
||||||
|
|
||||||
|
doCopy:
|
||||||
|
// This is the end of the outer "switch", when we have a copy tag.
|
||||||
|
//
|
||||||
|
// We assume that:
|
||||||
|
// - R_LEN == length && R_LEN > 0
|
||||||
|
// - R_OFF == offset
|
||||||
|
|
||||||
|
// if d < offset { etc }
|
||||||
|
MOVQ R_DST, R_TMP1
|
||||||
|
SUBQ R_DBASE, R_TMP1
|
||||||
|
CMPQ R_TMP1, R_OFF
|
||||||
|
JLT errCorrupt
|
||||||
|
|
||||||
|
// Repeat values can skip the test above, since any offset > 0 will be in dst.
|
||||||
|
doCopyRepeat:
|
||||||
|
// if offset <= 0 { etc }
|
||||||
|
CMPQ R_OFF, $0
|
||||||
|
JLE errCorrupt
|
||||||
|
|
||||||
|
// if length > len(dst)-d { etc }
|
||||||
|
MOVQ R_DEND, R_TMP1
|
||||||
|
SUBQ R_DST, R_TMP1
|
||||||
|
CMPQ R_LEN, R_TMP1
|
||||||
|
JGT errCorrupt
|
||||||
|
|
||||||
|
// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
|
||||||
|
//
|
||||||
|
// Set:
|
||||||
|
// - R_TMP2 = len(dst)-d
|
||||||
|
// - R_TMP3 = &dst[d-offset]
|
||||||
|
MOVQ R_DEND, R_TMP2
|
||||||
|
SUBQ R_DST, R_TMP2
|
||||||
|
MOVQ R_DST, R_TMP3
|
||||||
|
SUBQ R_OFF, R_TMP3
|
||||||
|
|
||||||
|
// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
|
||||||
|
//
|
||||||
|
// First, try using two 8-byte load/stores, similar to the doLit technique
|
||||||
|
// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
|
||||||
|
// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
|
||||||
|
// and not one 16-byte load/store, and the first store has to be before the
|
||||||
|
// second load, due to the overlap if offset is in the range [8, 16).
|
||||||
|
//
|
||||||
|
// if length > 16 || offset < 8 || len(dst)-d < 16 {
|
||||||
|
// goto slowForwardCopy
|
||||||
|
// }
|
||||||
|
// copy 16 bytes
|
||||||
|
// d += length
|
||||||
|
CMPQ R_LEN, $16
|
||||||
|
JGT slowForwardCopy
|
||||||
|
CMPQ R_OFF, $8
|
||||||
|
JLT slowForwardCopy
|
||||||
|
CMPQ R_TMP2, $16
|
||||||
|
JLT slowForwardCopy
|
||||||
|
MOVQ 0(R_TMP3), R_TMP0
|
||||||
|
MOVQ R_TMP0, 0(R_DST)
|
||||||
|
MOVQ 8(R_TMP3), R_TMP1
|
||||||
|
MOVQ R_TMP1, 8(R_DST)
|
||||||
|
ADDQ R_LEN, R_DST
|
||||||
|
JMP loop
|
||||||
|
|
||||||
|
slowForwardCopy:
|
||||||
|
// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
|
||||||
|
// can still try 8-byte load stores, provided we can overrun up to 10 extra
|
||||||
|
// bytes. As above, the overrun will be fixed up by subsequent iterations
|
||||||
|
// of the outermost loop.
|
||||||
|
//
|
||||||
|
// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
|
||||||
|
// commentary says:
|
||||||
|
//
|
||||||
|
// ----
|
||||||
|
//
|
||||||
|
// The main part of this loop is a simple copy of eight bytes at a time
|
||||||
|
// until we've copied (at least) the requested amount of bytes. However,
|
||||||
|
// if d and d-offset are less than eight bytes apart (indicating a
|
||||||
|
// repeating pattern of length < 8), we first need to expand the pattern in
|
||||||
|
// order to get the correct results. For instance, if the buffer looks like
|
||||||
|
// this, with the eight-byte <d-offset> and <d> patterns marked as
|
||||||
|
// intervals:
|
||||||
|
//
|
||||||
|
// abxxxxxxxxxxxx
|
||||||
|
// [------] d-offset
|
||||||
|
// [------] d
|
||||||
|
//
|
||||||
|
// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
|
||||||
|
// once, after which we can move <d> two bytes without moving <d-offset>:
|
||||||
|
//
|
||||||
|
// ababxxxxxxxxxx
|
||||||
|
// [------] d-offset
|
||||||
|
// [------] d
|
||||||
|
//
|
||||||
|
// and repeat the exercise until the two no longer overlap.
|
||||||
|
//
|
||||||
|
// This allows us to do very well in the special case of one single byte
|
||||||
|
// repeated many times, without taking a big hit for more general cases.
|
||||||
|
//
|
||||||
|
// The worst case of extra writing past the end of the match occurs when
|
||||||
|
// offset == 1 and length == 1; the last copy will read from byte positions
|
||||||
|
// [0..7] and write to [4..11], whereas it was only supposed to write to
|
||||||
|
// position 1. Thus, ten excess bytes.
|
||||||
|
//
|
||||||
|
// ----
|
||||||
|
//
|
||||||
|
// That "10 byte overrun" worst case is confirmed by Go's
|
||||||
|
// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
|
||||||
|
// and finishSlowForwardCopy algorithm.
|
||||||
|
//
|
||||||
|
// if length > len(dst)-d-10 {
|
||||||
|
// goto verySlowForwardCopy
|
||||||
|
// }
|
||||||
|
SUBQ $10, R_TMP2
|
||||||
|
CMPQ R_LEN, R_TMP2
|
||||||
|
JGT verySlowForwardCopy
|
||||||
|
|
||||||
|
// We want to keep the offset, so we use R_TMP2 from here.
|
||||||
|
MOVQ R_OFF, R_TMP2
|
||||||
|
|
||||||
|
makeOffsetAtLeast8:
|
||||||
|
// !!! As above, expand the pattern so that offset >= 8 and we can use
|
||||||
|
// 8-byte load/stores.
|
||||||
|
//
|
||||||
|
// for offset < 8 {
|
||||||
|
// copy 8 bytes from dst[d-offset:] to dst[d:]
|
||||||
|
// length -= offset
|
||||||
|
// d += offset
|
||||||
|
// offset += offset
|
||||||
|
// // The two previous lines together means that d-offset, and therefore
|
||||||
|
// // R_TMP3, is unchanged.
|
||||||
|
// }
|
||||||
|
CMPQ R_TMP2, $8
|
||||||
|
JGE fixUpSlowForwardCopy
|
||||||
|
MOVQ (R_TMP3), R_TMP1
|
||||||
|
MOVQ R_TMP1, (R_DST)
|
||||||
|
SUBQ R_TMP2, R_LEN
|
||||||
|
ADDQ R_TMP2, R_DST
|
||||||
|
ADDQ R_TMP2, R_TMP2
|
||||||
|
JMP makeOffsetAtLeast8
|
||||||
|
|
||||||
|
fixUpSlowForwardCopy:
|
||||||
|
// !!! Add length (which might be negative now) to d (implied by R_DST being
|
||||||
|
// &dst[d]) so that d ends up at the right place when we jump back to the
|
||||||
|
// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
|
||||||
|
// length is positive, copying the remaining length bytes will write to the
|
||||||
|
// right place.
|
||||||
|
MOVQ R_DST, R_TMP0
|
||||||
|
ADDQ R_LEN, R_DST
|
||||||
|
|
||||||
|
finishSlowForwardCopy:
|
||||||
|
// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
|
||||||
|
// length means that we overrun, but as above, that will be fixed up by
|
||||||
|
// subsequent iterations of the outermost loop.
|
||||||
|
CMPQ R_LEN, $0
|
||||||
|
JLE loop
|
||||||
|
MOVQ (R_TMP3), R_TMP1
|
||||||
|
MOVQ R_TMP1, (R_TMP0)
|
||||||
|
ADDQ $8, R_TMP3
|
||||||
|
ADDQ $8, R_TMP0
|
||||||
|
SUBQ $8, R_LEN
|
||||||
|
JMP finishSlowForwardCopy
|
||||||
|
|
||||||
|
verySlowForwardCopy:
|
||||||
|
// verySlowForwardCopy is a simple implementation of forward copy. In C
|
||||||
|
// parlance, this is a do/while loop instead of a while loop, since we know
|
||||||
|
// that length > 0. In Go syntax:
|
||||||
|
//
|
||||||
|
// for {
|
||||||
|
// dst[d] = dst[d - offset]
|
||||||
|
// d++
|
||||||
|
// length--
|
||||||
|
// if length == 0 {
|
||||||
|
// break
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
MOVB (R_TMP3), R_TMP1
|
||||||
|
MOVB R_TMP1, (R_DST)
|
||||||
|
INCQ R_TMP3
|
||||||
|
INCQ R_DST
|
||||||
|
DECQ R_LEN
|
||||||
|
JNZ verySlowForwardCopy
|
||||||
|
JMP loop
|
||||||
|
|
||||||
|
// The code above handles copy tags.
|
||||||
|
// ----------------------------------------
|
||||||
|
|
||||||
|
end:
|
||||||
|
// This is the end of the "for s < len(src)".
|
||||||
|
//
|
||||||
|
// if d != len(dst) { etc }
|
||||||
|
CMPQ R_DST, R_DEND
|
||||||
|
JNE errCorrupt
|
||||||
|
|
||||||
|
// return 0
|
||||||
|
MOVQ $0, ret+48(FP)
|
||||||
|
RET
|
||||||
|
|
||||||
|
errCorrupt:
|
||||||
|
// return decodeErrCodeCorrupt
|
||||||
|
MOVQ $1, ret+48(FP)
|
||||||
|
RET
|
574
vendor/github.com/klauspost/compress/s2/decode_arm64.s
generated
vendored
Normal file
574
vendor/github.com/klauspost/compress/s2/decode_arm64.s
generated
vendored
Normal file
|
@ -0,0 +1,574 @@
|
||||||
|
// Copyright 2020 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
// +build !appengine
|
||||||
|
// +build gc
|
||||||
|
// +build !noasm
|
||||||
|
|
||||||
|
#include "textflag.h"
|
||||||
|
|
||||||
|
#define R_TMP0 R2
|
||||||
|
#define R_TMP1 R3
|
||||||
|
#define R_LEN R4
|
||||||
|
#define R_OFF R5
|
||||||
|
#define R_SRC R6
|
||||||
|
#define R_DST R7
|
||||||
|
#define R_DBASE R8
|
||||||
|
#define R_DLEN R9
|
||||||
|
#define R_DEND R10
|
||||||
|
#define R_SBASE R11
|
||||||
|
#define R_SLEN R12
|
||||||
|
#define R_SEND R13
|
||||||
|
#define R_TMP2 R14
|
||||||
|
#define R_TMP3 R15
|
||||||
|
|
||||||
|
// TEST_SRC will check if R_SRC is <= SRC_END
|
||||||
|
#define TEST_SRC() \
|
||||||
|
CMP R_SEND, R_SRC \
|
||||||
|
BGT errCorrupt
|
||||||
|
|
||||||
|
// MOVD R_SRC, R_TMP1
|
||||||
|
// SUB R_SBASE, R_TMP1, R_TMP1
|
||||||
|
// CMP R_SLEN, R_TMP1
|
||||||
|
// BGT errCorrupt
|
||||||
|
|
||||||
|
// The asm code generally follows the pure Go code in decode_other.go, except
|
||||||
|
// where marked with a "!!!".
|
||||||
|
|
||||||
|
// func decode(dst, src []byte) int
|
||||||
|
//
|
||||||
|
// All local variables fit into registers. The non-zero stack size is only to
|
||||||
|
// spill registers and push args when issuing a CALL. The register allocation:
|
||||||
|
// - R_TMP0 scratch
|
||||||
|
// - R_TMP1 scratch
|
||||||
|
// - R_LEN length or x
|
||||||
|
// - R_OFF offset
|
||||||
|
// - R_SRC &src[s]
|
||||||
|
// - R_DST &dst[d]
|
||||||
|
// + R_DBASE dst_base
|
||||||
|
// + R_DLEN dst_len
|
||||||
|
// + R_DEND dst_base + dst_len
|
||||||
|
// + R_SBASE src_base
|
||||||
|
// + R_SLEN src_len
|
||||||
|
// + R_SEND src_base + src_len
|
||||||
|
// - R_TMP2 used by doCopy
|
||||||
|
// - R_TMP3 used by doCopy
|
||||||
|
//
|
||||||
|
// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
|
||||||
|
// function, and after a CALL returns, and are not otherwise modified.
|
||||||
|
//
|
||||||
|
// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
|
||||||
|
// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
|
||||||
|
TEXT ·s2Decode(SB), NOSPLIT, $56-64
|
||||||
|
// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
|
||||||
|
MOVD dst_base+0(FP), R_DBASE
|
||||||
|
MOVD dst_len+8(FP), R_DLEN
|
||||||
|
MOVD R_DBASE, R_DST
|
||||||
|
MOVD R_DBASE, R_DEND
|
||||||
|
ADD R_DLEN, R_DEND, R_DEND
|
||||||
|
MOVD src_base+24(FP), R_SBASE
|
||||||
|
MOVD src_len+32(FP), R_SLEN
|
||||||
|
MOVD R_SBASE, R_SRC
|
||||||
|
MOVD R_SBASE, R_SEND
|
||||||
|
ADD R_SLEN, R_SEND, R_SEND
|
||||||
|
MOVD $0, R_OFF
|
||||||
|
|
||||||
|
loop:
|
||||||
|
// for s < len(src)
|
||||||
|
CMP R_SEND, R_SRC
|
||||||
|
BEQ end
|
||||||
|
|
||||||
|
// R_LEN = uint32(src[s])
|
||||||
|
//
|
||||||
|
// switch src[s] & 0x03
|
||||||
|
MOVBU (R_SRC), R_LEN
|
||||||
|
MOVW R_LEN, R_TMP1
|
||||||
|
ANDW $3, R_TMP1
|
||||||
|
MOVW $1, R1
|
||||||
|
CMPW R1, R_TMP1
|
||||||
|
BGE tagCopy
|
||||||
|
|
||||||
|
// ----------------------------------------
|
||||||
|
// The code below handles literal tags.
|
||||||
|
|
||||||
|
// case tagLiteral:
|
||||||
|
// x := uint32(src[s] >> 2)
|
||||||
|
// switch
|
||||||
|
MOVW $60, R1
|
||||||
|
LSRW $2, R_LEN, R_LEN
|
||||||
|
CMPW R_LEN, R1
|
||||||
|
BLS tagLit60Plus
|
||||||
|
|
||||||
|
// case x < 60:
|
||||||
|
// s++
|
||||||
|
ADD $1, R_SRC, R_SRC
|
||||||
|
|
||||||
|
doLit:
|
||||||
|
// This is the end of the inner "switch", when we have a literal tag.
|
||||||
|
//
|
||||||
|
// We assume that R_LEN == x and x fits in a uint32, where x is the variable
|
||||||
|
// used in the pure Go decode_other.go code.
|
||||||
|
|
||||||
|
// length = int(x) + 1
|
||||||
|
//
|
||||||
|
// Unlike the pure Go code, we don't need to check if length <= 0 because
|
||||||
|
// R_LEN can hold 64 bits, so the increment cannot overflow.
|
||||||
|
ADD $1, R_LEN, R_LEN
|
||||||
|
|
||||||
|
// Prepare to check if copying length bytes will run past the end of dst or
|
||||||
|
// src.
|
||||||
|
//
|
||||||
|
// R_TMP0 = len(dst) - d
|
||||||
|
// R_TMP1 = len(src) - s
|
||||||
|
MOVD R_DEND, R_TMP0
|
||||||
|
SUB R_DST, R_TMP0, R_TMP0
|
||||||
|
MOVD R_SEND, R_TMP1
|
||||||
|
SUB R_SRC, R_TMP1, R_TMP1
|
||||||
|
|
||||||
|
// !!! Try a faster technique for short (16 or fewer bytes) copies.
|
||||||
|
//
|
||||||
|
// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
|
||||||
|
// goto callMemmove // Fall back on calling runtime·memmove.
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
|
||||||
|
// against 21 instead of 16, because it cannot assume that all of its input
|
||||||
|
// is contiguous in memory and so it needs to leave enough source bytes to
|
||||||
|
// read the next tag without refilling buffers, but Go's Decode assumes
|
||||||
|
// contiguousness (the src argument is a []byte).
|
||||||
|
CMP $16, R_LEN
|
||||||
|
BGT callMemmove
|
||||||
|
CMP $16, R_TMP0
|
||||||
|
BLT callMemmove
|
||||||
|
CMP $16, R_TMP1
|
||||||
|
BLT callMemmove
|
||||||
|
|
||||||
|
// !!! Implement the copy from src to dst as a 16-byte load and store.
|
||||||
|
// (Decode's documentation says that dst and src must not overlap.)
|
||||||
|
//
|
||||||
|
// This always copies 16 bytes, instead of only length bytes, but that's
|
||||||
|
// OK. If the input is a valid Snappy encoding then subsequent iterations
|
||||||
|
// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
|
||||||
|
// non-nil error), so the overrun will be ignored.
|
||||||
|
//
|
||||||
|
// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
|
||||||
|
// 16-byte loads and stores. This technique probably wouldn't be as
|
||||||
|
// effective on architectures that are fussier about alignment.
|
||||||
|
LDP 0(R_SRC), (R_TMP2, R_TMP3)
|
||||||
|
STP (R_TMP2, R_TMP3), 0(R_DST)
|
||||||
|
|
||||||
|
// d += length
|
||||||
|
// s += length
|
||||||
|
ADD R_LEN, R_DST, R_DST
|
||||||
|
ADD R_LEN, R_SRC, R_SRC
|
||||||
|
B loop
|
||||||
|
|
||||||
|
callMemmove:
|
||||||
|
// if length > len(dst)-d || length > len(src)-s { etc }
|
||||||
|
CMP R_TMP0, R_LEN
|
||||||
|
BGT errCorrupt
|
||||||
|
CMP R_TMP1, R_LEN
|
||||||
|
BGT errCorrupt
|
||||||
|
|
||||||
|
// copy(dst[d:], src[s:s+length])
|
||||||
|
//
|
||||||
|
// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
|
||||||
|
// R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
|
||||||
|
// three registers to the stack, to save local variables across the CALL.
|
||||||
|
MOVD R_DST, 8(RSP)
|
||||||
|
MOVD R_SRC, 16(RSP)
|
||||||
|
MOVD R_LEN, 24(RSP)
|
||||||
|
MOVD R_DST, 32(RSP)
|
||||||
|
MOVD R_SRC, 40(RSP)
|
||||||
|
MOVD R_LEN, 48(RSP)
|
||||||
|
MOVD R_OFF, 56(RSP)
|
||||||
|
CALL runtime·memmove(SB)
|
||||||
|
|
||||||
|
// Restore local variables: unspill registers from the stack and
|
||||||
|
// re-calculate R_DBASE-R_SEND.
|
||||||
|
MOVD 32(RSP), R_DST
|
||||||
|
MOVD 40(RSP), R_SRC
|
||||||
|
MOVD 48(RSP), R_LEN
|
||||||
|
MOVD 56(RSP), R_OFF
|
||||||
|
MOVD dst_base+0(FP), R_DBASE
|
||||||
|
MOVD dst_len+8(FP), R_DLEN
|
||||||
|
MOVD R_DBASE, R_DEND
|
||||||
|
ADD R_DLEN, R_DEND, R_DEND
|
||||||
|
MOVD src_base+24(FP), R_SBASE
|
||||||
|
MOVD src_len+32(FP), R_SLEN
|
||||||
|
MOVD R_SBASE, R_SEND
|
||||||
|
ADD R_SLEN, R_SEND, R_SEND
|
||||||
|
|
||||||
|
// d += length
|
||||||
|
// s += length
|
||||||
|
ADD R_LEN, R_DST, R_DST
|
||||||
|
ADD R_LEN, R_SRC, R_SRC
|
||||||
|
B loop
|
||||||
|
|
||||||
|
tagLit60Plus:
|
||||||
|
// !!! This fragment does the
|
||||||
|
//
|
||||||
|
// s += x - 58; if uint(s) > uint(len(src)) { etc }
|
||||||
|
//
|
||||||
|
// checks. In the asm version, we code it once instead of once per switch case.
|
||||||
|
ADD R_LEN, R_SRC, R_SRC
|
||||||
|
SUB $58, R_SRC, R_SRC
|
||||||
|
TEST_SRC()
|
||||||
|
|
||||||
|
// case x == 60:
|
||||||
|
MOVW $61, R1
|
||||||
|
CMPW R1, R_LEN
|
||||||
|
BEQ tagLit61
|
||||||
|
BGT tagLit62Plus
|
||||||
|
|
||||||
|
// x = uint32(src[s-1])
|
||||||
|
MOVBU -1(R_SRC), R_LEN
|
||||||
|
B doLit
|
||||||
|
|
||||||
|
tagLit61:
|
||||||
|
// case x == 61:
|
||||||
|
// x = uint32(src[s-2]) | uint32(src[s-1])<<8
|
||||||
|
MOVHU -2(R_SRC), R_LEN
|
||||||
|
B doLit
|
||||||
|
|
||||||
|
tagLit62Plus:
|
||||||
|
CMPW $62, R_LEN
|
||||||
|
BHI tagLit63
|
||||||
|
|
||||||
|
// case x == 62:
|
||||||
|
// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
|
||||||
|
MOVHU -3(R_SRC), R_LEN
|
||||||
|
MOVBU -1(R_SRC), R_TMP1
|
||||||
|
ORR R_TMP1<<16, R_LEN
|
||||||
|
B doLit
|
||||||
|
|
||||||
|
tagLit63:
|
||||||
|
// case x == 63:
|
||||||
|
// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
|
||||||
|
MOVWU -4(R_SRC), R_LEN
|
||||||
|
B doLit
|
||||||
|
|
||||||
|
// The code above handles literal tags.
|
||||||
|
// ----------------------------------------
|
||||||
|
// The code below handles copy tags.
|
||||||
|
|
||||||
|
tagCopy4:
|
||||||
|
// case tagCopy4:
|
||||||
|
// s += 5
|
||||||
|
ADD $5, R_SRC, R_SRC
|
||||||
|
|
||||||
|
// if uint(s) > uint(len(src)) { etc }
|
||||||
|
MOVD R_SRC, R_TMP1
|
||||||
|
SUB R_SBASE, R_TMP1, R_TMP1
|
||||||
|
CMP R_SLEN, R_TMP1
|
||||||
|
BGT errCorrupt
|
||||||
|
|
||||||
|
// length = 1 + int(src[s-5])>>2
|
||||||
|
MOVD $1, R1
|
||||||
|
ADD R_LEN>>2, R1, R_LEN
|
||||||
|
|
||||||
|
// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
|
||||||
|
MOVWU -4(R_SRC), R_OFF
|
||||||
|
B doCopy
|
||||||
|
|
||||||
|
tagCopy2:
|
||||||
|
// case tagCopy2:
|
||||||
|
// s += 3
|
||||||
|
ADD $3, R_SRC, R_SRC
|
||||||
|
|
||||||
|
// if uint(s) > uint(len(src)) { etc }
|
||||||
|
TEST_SRC()
|
||||||
|
|
||||||
|
// length = 1 + int(src[s-3])>>2
|
||||||
|
MOVD $1, R1
|
||||||
|
ADD R_LEN>>2, R1, R_LEN
|
||||||
|
|
||||||
|
// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
|
||||||
|
MOVHU -2(R_SRC), R_OFF
|
||||||
|
B doCopy
|
||||||
|
|
||||||
|
tagCopy:
|
||||||
|
// We have a copy tag. We assume that:
|
||||||
|
// - R_TMP1 == src[s] & 0x03
|
||||||
|
// - R_LEN == src[s]
|
||||||
|
CMP $2, R_TMP1
|
||||||
|
BEQ tagCopy2
|
||||||
|
BGT tagCopy4
|
||||||
|
|
||||||
|
// case tagCopy1:
|
||||||
|
// s += 2
|
||||||
|
ADD $2, R_SRC, R_SRC
|
||||||
|
|
||||||
|
// if uint(s) > uint(len(src)) { etc }
|
||||||
|
TEST_SRC()
|
||||||
|
|
||||||
|
// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
|
||||||
|
// Calculate offset in R_TMP0 in case it is a repeat.
|
||||||
|
MOVD R_LEN, R_TMP0
|
||||||
|
AND $0xe0, R_TMP0
|
||||||
|
MOVBU -1(R_SRC), R_TMP1
|
||||||
|
ORR R_TMP0<<3, R_TMP1, R_TMP0
|
||||||
|
|
||||||
|
// length = 4 + int(src[s-2])>>2&0x7
|
||||||
|
MOVD $7, R1
|
||||||
|
AND R_LEN>>2, R1, R_LEN
|
||||||
|
ADD $4, R_LEN, R_LEN
|
||||||
|
|
||||||
|
// check if repeat code with offset 0.
|
||||||
|
CMP $0, R_TMP0
|
||||||
|
BEQ repeatCode
|
||||||
|
|
||||||
|
// This is a regular copy, transfer our temporary value to R_OFF (offset)
|
||||||
|
MOVD R_TMP0, R_OFF
|
||||||
|
B doCopy
|
||||||
|
|
||||||
|
// This is a repeat code.
|
||||||
|
repeatCode:
|
||||||
|
// If length < 9, reuse last offset, with the length already calculated.
|
||||||
|
CMP $9, R_LEN
|
||||||
|
BLT doCopyRepeat
|
||||||
|
BEQ repeatLen1
|
||||||
|
CMP $10, R_LEN
|
||||||
|
BEQ repeatLen2
|
||||||
|
|
||||||
|
repeatLen3:
|
||||||
|
// s +=3
|
||||||
|
ADD $3, R_SRC, R_SRC
|
||||||
|
|
||||||
|
// if uint(s) > uint(len(src)) { etc }
|
||||||
|
TEST_SRC()
|
||||||
|
|
||||||
|
// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540
|
||||||
|
MOVBU -1(R_SRC), R_TMP0
|
||||||
|
MOVHU -3(R_SRC), R_LEN
|
||||||
|
ORR R_TMP0<<16, R_LEN, R_LEN
|
||||||
|
ADD $65540, R_LEN, R_LEN
|
||||||
|
B doCopyRepeat
|
||||||
|
|
||||||
|
repeatLen2:
|
||||||
|
// s +=2
|
||||||
|
ADD $2, R_SRC, R_SRC
|
||||||
|
|
||||||
|
// if uint(s) > uint(len(src)) { etc }
|
||||||
|
TEST_SRC()
|
||||||
|
|
||||||
|
// length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260
|
||||||
|
MOVHU -2(R_SRC), R_LEN
|
||||||
|
ADD $260, R_LEN, R_LEN
|
||||||
|
B doCopyRepeat
|
||||||
|
|
||||||
|
repeatLen1:
|
||||||
|
// s +=1
|
||||||
|
ADD $1, R_SRC, R_SRC
|
||||||
|
|
||||||
|
// if uint(s) > uint(len(src)) { etc }
|
||||||
|
TEST_SRC()
|
||||||
|
|
||||||
|
// length = src[s-1] + 8
|
||||||
|
MOVBU -1(R_SRC), R_LEN
|
||||||
|
ADD $8, R_LEN, R_LEN
|
||||||
|
B doCopyRepeat
|
||||||
|
|
||||||
|
doCopy:
|
||||||
|
// This is the end of the outer "switch", when we have a copy tag.
|
||||||
|
//
|
||||||
|
// We assume that:
|
||||||
|
// - R_LEN == length && R_LEN > 0
|
||||||
|
// - R_OFF == offset
|
||||||
|
|
||||||
|
// if d < offset { etc }
|
||||||
|
MOVD R_DST, R_TMP1
|
||||||
|
SUB R_DBASE, R_TMP1, R_TMP1
|
||||||
|
CMP R_OFF, R_TMP1
|
||||||
|
BLT errCorrupt
|
||||||
|
|
||||||
|
// Repeat values can skip the test above, since any offset > 0 will be in dst.
|
||||||
|
doCopyRepeat:
|
||||||
|
|
||||||
|
// if offset <= 0 { etc }
|
||||||
|
CMP $0, R_OFF
|
||||||
|
BLE errCorrupt
|
||||||
|
|
||||||
|
// if length > len(dst)-d { etc }
|
||||||
|
MOVD R_DEND, R_TMP1
|
||||||
|
SUB R_DST, R_TMP1, R_TMP1
|
||||||
|
CMP R_TMP1, R_LEN
|
||||||
|
BGT errCorrupt
|
||||||
|
|
||||||
|
// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
|
||||||
|
//
|
||||||
|
// Set:
|
||||||
|
// - R_TMP2 = len(dst)-d
|
||||||
|
// - R_TMP3 = &dst[d-offset]
|
||||||
|
MOVD R_DEND, R_TMP2
|
||||||
|
SUB R_DST, R_TMP2, R_TMP2
|
||||||
|
MOVD R_DST, R_TMP3
|
||||||
|
SUB R_OFF, R_TMP3, R_TMP3
|
||||||
|
|
||||||
|
// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
|
||||||
|
//
|
||||||
|
// First, try using two 8-byte load/stores, similar to the doLit technique
|
||||||
|
// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
|
||||||
|
// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
|
||||||
|
// and not one 16-byte load/store, and the first store has to be before the
|
||||||
|
// second load, due to the overlap if offset is in the range [8, 16).
|
||||||
|
//
|
||||||
|
// if length > 16 || offset < 8 || len(dst)-d < 16 {
|
||||||
|
// goto slowForwardCopy
|
||||||
|
// }
|
||||||
|
// copy 16 bytes
|
||||||
|
// d += length
|
||||||
|
CMP $16, R_LEN
|
||||||
|
BGT slowForwardCopy
|
||||||
|
CMP $8, R_OFF
|
||||||
|
BLT slowForwardCopy
|
||||||
|
CMP $16, R_TMP2
|
||||||
|
BLT slowForwardCopy
|
||||||
|
MOVD 0(R_TMP3), R_TMP0
|
||||||
|
MOVD R_TMP0, 0(R_DST)
|
||||||
|
MOVD 8(R_TMP3), R_TMP1
|
||||||
|
MOVD R_TMP1, 8(R_DST)
|
||||||
|
ADD R_LEN, R_DST, R_DST
|
||||||
|
B loop
|
||||||
|
|
||||||
|
slowForwardCopy:
|
||||||
|
// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
|
||||||
|
// can still try 8-byte load stores, provided we can overrun up to 10 extra
|
||||||
|
// bytes. As above, the overrun will be fixed up by subsequent iterations
|
||||||
|
// of the outermost loop.
|
||||||
|
//
|
||||||
|
// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
|
||||||
|
// commentary says:
|
||||||
|
//
|
||||||
|
// ----
|
||||||
|
//
|
||||||
|
// The main part of this loop is a simple copy of eight bytes at a time
|
||||||
|
// until we've copied (at least) the requested amount of bytes. However,
|
||||||
|
// if d and d-offset are less than eight bytes apart (indicating a
|
||||||
|
// repeating pattern of length < 8), we first need to expand the pattern in
|
||||||
|
// order to get the correct results. For instance, if the buffer looks like
|
||||||
|
// this, with the eight-byte <d-offset> and <d> patterns marked as
|
||||||
|
// intervals:
|
||||||
|
//
|
||||||
|
// abxxxxxxxxxxxx
|
||||||
|
// [------] d-offset
|
||||||
|
// [------] d
|
||||||
|
//
|
||||||
|
// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
|
||||||
|
// once, after which we can move <d> two bytes without moving <d-offset>:
|
||||||
|
//
|
||||||
|
// ababxxxxxxxxxx
|
||||||
|
// [------] d-offset
|
||||||
|
// [------] d
|
||||||
|
//
|
||||||
|
// and repeat the exercise until the two no longer overlap.
|
||||||
|
//
|
||||||
|
// This allows us to do very well in the special case of one single byte
|
||||||
|
// repeated many times, without taking a big hit for more general cases.
|
||||||
|
//
|
||||||
|
// The worst case of extra writing past the end of the match occurs when
|
||||||
|
// offset == 1 and length == 1; the last copy will read from byte positions
|
||||||
|
// [0..7] and write to [4..11], whereas it was only supposed to write to
|
||||||
|
// position 1. Thus, ten excess bytes.
|
||||||
|
//
|
||||||
|
// ----
|
||||||
|
//
|
||||||
|
// That "10 byte overrun" worst case is confirmed by Go's
|
||||||
|
// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
|
||||||
|
// and finishSlowForwardCopy algorithm.
|
||||||
|
//
|
||||||
|
// if length > len(dst)-d-10 {
|
||||||
|
// goto verySlowForwardCopy
|
||||||
|
// }
|
||||||
|
SUB $10, R_TMP2, R_TMP2
|
||||||
|
CMP R_TMP2, R_LEN
|
||||||
|
BGT verySlowForwardCopy
|
||||||
|
|
||||||
|
// We want to keep the offset, so we use R_TMP2 from here.
|
||||||
|
MOVD R_OFF, R_TMP2
|
||||||
|
|
||||||
|
makeOffsetAtLeast8:
|
||||||
|
// !!! As above, expand the pattern so that offset >= 8 and we can use
|
||||||
|
// 8-byte load/stores.
|
||||||
|
//
|
||||||
|
// for offset < 8 {
|
||||||
|
// copy 8 bytes from dst[d-offset:] to dst[d:]
|
||||||
|
// length -= offset
|
||||||
|
// d += offset
|
||||||
|
// offset += offset
|
||||||
|
// // The two previous lines together means that d-offset, and therefore
|
||||||
|
// // R_TMP3, is unchanged.
|
||||||
|
// }
|
||||||
|
CMP $8, R_TMP2
|
||||||
|
BGE fixUpSlowForwardCopy
|
||||||
|
MOVD (R_TMP3), R_TMP1
|
||||||
|
MOVD R_TMP1, (R_DST)
|
||||||
|
SUB R_TMP2, R_LEN, R_LEN
|
||||||
|
ADD R_TMP2, R_DST, R_DST
|
||||||
|
ADD R_TMP2, R_TMP2, R_TMP2
|
||||||
|
B makeOffsetAtLeast8
|
||||||
|
|
||||||
|
fixUpSlowForwardCopy:
|
||||||
|
// !!! Add length (which might be negative now) to d (implied by R_DST being
|
||||||
|
// &dst[d]) so that d ends up at the right place when we jump back to the
|
||||||
|
// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
|
||||||
|
// length is positive, copying the remaining length bytes will write to the
|
||||||
|
// right place.
|
||||||
|
MOVD R_DST, R_TMP0
|
||||||
|
ADD R_LEN, R_DST, R_DST
|
||||||
|
|
||||||
|
finishSlowForwardCopy:
|
||||||
|
// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
|
||||||
|
// length means that we overrun, but as above, that will be fixed up by
|
||||||
|
// subsequent iterations of the outermost loop.
|
||||||
|
MOVD $0, R1
|
||||||
|
CMP R1, R_LEN
|
||||||
|
BLE loop
|
||||||
|
MOVD (R_TMP3), R_TMP1
|
||||||
|
MOVD R_TMP1, (R_TMP0)
|
||||||
|
ADD $8, R_TMP3, R_TMP3
|
||||||
|
ADD $8, R_TMP0, R_TMP0
|
||||||
|
SUB $8, R_LEN, R_LEN
|
||||||
|
B finishSlowForwardCopy
|
||||||
|
|
||||||
|
verySlowForwardCopy:
|
||||||
|
// verySlowForwardCopy is a simple implementation of forward copy. In C
|
||||||
|
// parlance, this is a do/while loop instead of a while loop, since we know
|
||||||
|
// that length > 0. In Go syntax:
|
||||||
|
//
|
||||||
|
// for {
|
||||||
|
// dst[d] = dst[d - offset]
|
||||||
|
// d++
|
||||||
|
// length--
|
||||||
|
// if length == 0 {
|
||||||
|
// break
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
MOVB (R_TMP3), R_TMP1
|
||||||
|
MOVB R_TMP1, (R_DST)
|
||||||
|
ADD $1, R_TMP3, R_TMP3
|
||||||
|
ADD $1, R_DST, R_DST
|
||||||
|
SUB $1, R_LEN, R_LEN
|
||||||
|
CBNZ R_LEN, verySlowForwardCopy
|
||||||
|
B loop
|
||||||
|
|
||||||
|
// The code above handles copy tags.
|
||||||
|
// ----------------------------------------
|
||||||
|
|
||||||
|
end:
|
||||||
|
// This is the end of the "for s < len(src)".
|
||||||
|
//
|
||||||
|
// if d != len(dst) { etc }
|
||||||
|
CMP R_DEND, R_DST
|
||||||
|
BNE errCorrupt
|
||||||
|
|
||||||
|
// return 0
|
||||||
|
MOVD $0, ret+48(FP)
|
||||||
|
RET
|
||||||
|
|
||||||
|
errCorrupt:
|
||||||
|
// return decodeErrCodeCorrupt
|
||||||
|
MOVD $1, R_TMP0
|
||||||
|
MOVD R_TMP0, ret+48(FP)
|
||||||
|
RET
|
17
vendor/github.com/klauspost/compress/s2/decode_asm.go
generated
vendored
Normal file
17
vendor/github.com/klauspost/compress/s2/decode_asm.go
generated
vendored
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
|
||||||
|
// Copyright (c) 2019 Klaus Post. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
//go:build (amd64 || arm64) && !appengine && gc && !noasm
|
||||||
|
// +build amd64 arm64
|
||||||
|
// +build !appengine
|
||||||
|
// +build gc
|
||||||
|
// +build !noasm
|
||||||
|
|
||||||
|
package s2
|
||||||
|
|
||||||
|
// decode has the same semantics as in decode_other.go.
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func s2Decode(dst, src []byte) int
|
267
vendor/github.com/klauspost/compress/s2/decode_other.go
generated
vendored
Normal file
267
vendor/github.com/klauspost/compress/s2/decode_other.go
generated
vendored
Normal file
|
@ -0,0 +1,267 @@
|
||||||
|
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
|
||||||
|
// Copyright (c) 2019 Klaus Post. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
//go:build (!amd64 && !arm64) || appengine || !gc || noasm
|
||||||
|
// +build !amd64,!arm64 appengine !gc noasm
|
||||||
|
|
||||||
|
package s2
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strconv"
|
||||||
|
)
|
||||||
|
|
||||||
|
// decode writes the decoding of src to dst. It assumes that the varint-encoded
|
||||||
|
// length of the decompressed bytes has already been read, and that len(dst)
|
||||||
|
// equals that length.
|
||||||
|
//
|
||||||
|
// It returns 0 on success or a decodeErrCodeXxx error code on failure.
|
||||||
|
func s2Decode(dst, src []byte) int {
|
||||||
|
const debug = false
|
||||||
|
if debug {
|
||||||
|
fmt.Println("Starting decode, dst len:", len(dst))
|
||||||
|
}
|
||||||
|
var d, s, length int
|
||||||
|
offset := 0
|
||||||
|
|
||||||
|
// As long as we can read at least 5 bytes...
|
||||||
|
for s < len(src)-5 {
|
||||||
|
switch src[s] & 0x03 {
|
||||||
|
case tagLiteral:
|
||||||
|
x := uint32(src[s] >> 2)
|
||||||
|
switch {
|
||||||
|
case x < 60:
|
||||||
|
s++
|
||||||
|
case x == 60:
|
||||||
|
s += 2
|
||||||
|
x = uint32(src[s-1])
|
||||||
|
case x == 61:
|
||||||
|
s += 3
|
||||||
|
x = uint32(src[s-2]) | uint32(src[s-1])<<8
|
||||||
|
case x == 62:
|
||||||
|
s += 4
|
||||||
|
x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
|
||||||
|
case x == 63:
|
||||||
|
s += 5
|
||||||
|
x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
|
||||||
|
}
|
||||||
|
length = int(x) + 1
|
||||||
|
if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) {
|
||||||
|
return decodeErrCodeCorrupt
|
||||||
|
}
|
||||||
|
if debug {
|
||||||
|
fmt.Println("literals, length:", length, "d-after:", d+length)
|
||||||
|
}
|
||||||
|
|
||||||
|
copy(dst[d:], src[s:s+length])
|
||||||
|
d += length
|
||||||
|
s += length
|
||||||
|
continue
|
||||||
|
|
||||||
|
case tagCopy1:
|
||||||
|
s += 2
|
||||||
|
length = int(src[s-2]) >> 2 & 0x7
|
||||||
|
toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
|
||||||
|
if toffset == 0 {
|
||||||
|
if debug {
|
||||||
|
fmt.Print("(repeat) ")
|
||||||
|
}
|
||||||
|
// keep last offset
|
||||||
|
switch length {
|
||||||
|
case 5:
|
||||||
|
s += 1
|
||||||
|
length = int(uint32(src[s-1])) + 4
|
||||||
|
case 6:
|
||||||
|
s += 2
|
||||||
|
length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8)
|
||||||
|
case 7:
|
||||||
|
s += 3
|
||||||
|
length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16)
|
||||||
|
default: // 0-> 4
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
offset = toffset
|
||||||
|
}
|
||||||
|
length += 4
|
||||||
|
case tagCopy2:
|
||||||
|
s += 3
|
||||||
|
length = 1 + int(src[s-3])>>2
|
||||||
|
offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
|
||||||
|
|
||||||
|
case tagCopy4:
|
||||||
|
s += 5
|
||||||
|
length = 1 + int(src[s-5])>>2
|
||||||
|
offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
|
||||||
|
}
|
||||||
|
|
||||||
|
if offset <= 0 || d < offset || length > len(dst)-d {
|
||||||
|
return decodeErrCodeCorrupt
|
||||||
|
}
|
||||||
|
|
||||||
|
if debug {
|
||||||
|
fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copy from an earlier sub-slice of dst to a later sub-slice.
|
||||||
|
// If no overlap, use the built-in copy:
|
||||||
|
if offset > length {
|
||||||
|
copy(dst[d:d+length], dst[d-offset:])
|
||||||
|
d += length
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unlike the built-in copy function, this byte-by-byte copy always runs
|
||||||
|
// forwards, even if the slices overlap. Conceptually, this is:
|
||||||
|
//
|
||||||
|
// d += forwardCopy(dst[d:d+length], dst[d-offset:])
|
||||||
|
//
|
||||||
|
// We align the slices into a and b and show the compiler they are the same size.
|
||||||
|
// This allows the loop to run without bounds checks.
|
||||||
|
a := dst[d : d+length]
|
||||||
|
b := dst[d-offset:]
|
||||||
|
b = b[:len(a)]
|
||||||
|
for i := range a {
|
||||||
|
a[i] = b[i]
|
||||||
|
}
|
||||||
|
d += length
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remaining with extra checks...
|
||||||
|
for s < len(src) {
|
||||||
|
switch src[s] & 0x03 {
|
||||||
|
case tagLiteral:
|
||||||
|
x := uint32(src[s] >> 2)
|
||||||
|
switch {
|
||||||
|
case x < 60:
|
||||||
|
s++
|
||||||
|
case x == 60:
|
||||||
|
s += 2
|
||||||
|
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
|
||||||
|
return decodeErrCodeCorrupt
|
||||||
|
}
|
||||||
|
x = uint32(src[s-1])
|
||||||
|
case x == 61:
|
||||||
|
s += 3
|
||||||
|
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
|
||||||
|
return decodeErrCodeCorrupt
|
||||||
|
}
|
||||||
|
x = uint32(src[s-2]) | uint32(src[s-1])<<8
|
||||||
|
case x == 62:
|
||||||
|
s += 4
|
||||||
|
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
|
||||||
|
return decodeErrCodeCorrupt
|
||||||
|
}
|
||||||
|
x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
|
||||||
|
case x == 63:
|
||||||
|
s += 5
|
||||||
|
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
|
||||||
|
return decodeErrCodeCorrupt
|
||||||
|
}
|
||||||
|
x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
|
||||||
|
}
|
||||||
|
length = int(x) + 1
|
||||||
|
if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) {
|
||||||
|
return decodeErrCodeCorrupt
|
||||||
|
}
|
||||||
|
if debug {
|
||||||
|
fmt.Println("literals, length:", length, "d-after:", d+length)
|
||||||
|
}
|
||||||
|
|
||||||
|
copy(dst[d:], src[s:s+length])
|
||||||
|
d += length
|
||||||
|
s += length
|
||||||
|
continue
|
||||||
|
|
||||||
|
case tagCopy1:
|
||||||
|
s += 2
|
||||||
|
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
|
||||||
|
return decodeErrCodeCorrupt
|
||||||
|
}
|
||||||
|
length = int(src[s-2]) >> 2 & 0x7
|
||||||
|
toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
|
||||||
|
if toffset == 0 {
|
||||||
|
if debug {
|
||||||
|
fmt.Print("(repeat) ")
|
||||||
|
}
|
||||||
|
// keep last offset
|
||||||
|
switch length {
|
||||||
|
case 5:
|
||||||
|
s += 1
|
||||||
|
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
|
||||||
|
return decodeErrCodeCorrupt
|
||||||
|
}
|
||||||
|
length = int(uint32(src[s-1])) + 4
|
||||||
|
case 6:
|
||||||
|
s += 2
|
||||||
|
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
|
||||||
|
return decodeErrCodeCorrupt
|
||||||
|
}
|
||||||
|
length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8)
|
||||||
|
case 7:
|
||||||
|
s += 3
|
||||||
|
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
|
||||||
|
return decodeErrCodeCorrupt
|
||||||
|
}
|
||||||
|
length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16)
|
||||||
|
default: // 0-> 4
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
offset = toffset
|
||||||
|
}
|
||||||
|
length += 4
|
||||||
|
case tagCopy2:
|
||||||
|
s += 3
|
||||||
|
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
|
||||||
|
return decodeErrCodeCorrupt
|
||||||
|
}
|
||||||
|
length = 1 + int(src[s-3])>>2
|
||||||
|
offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
|
||||||
|
|
||||||
|
case tagCopy4:
|
||||||
|
s += 5
|
||||||
|
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
|
||||||
|
return decodeErrCodeCorrupt
|
||||||
|
}
|
||||||
|
length = 1 + int(src[s-5])>>2
|
||||||
|
offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
|
||||||
|
}
|
||||||
|
|
||||||
|
if offset <= 0 || d < offset || length > len(dst)-d {
|
||||||
|
return decodeErrCodeCorrupt
|
||||||
|
}
|
||||||
|
|
||||||
|
if debug {
|
||||||
|
fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copy from an earlier sub-slice of dst to a later sub-slice.
|
||||||
|
// If no overlap, use the built-in copy:
|
||||||
|
if offset > length {
|
||||||
|
copy(dst[d:d+length], dst[d-offset:])
|
||||||
|
d += length
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unlike the built-in copy function, this byte-by-byte copy always runs
|
||||||
|
// forwards, even if the slices overlap. Conceptually, this is:
|
||||||
|
//
|
||||||
|
// d += forwardCopy(dst[d:d+length], dst[d-offset:])
|
||||||
|
//
|
||||||
|
// We align the slices into a and b and show the compiler they are the same size.
|
||||||
|
// This allows the loop to run without bounds checks.
|
||||||
|
a := dst[d : d+length]
|
||||||
|
b := dst[d-offset:]
|
||||||
|
b = b[:len(a)]
|
||||||
|
for i := range a {
|
||||||
|
a[i] = b[i]
|
||||||
|
}
|
||||||
|
d += length
|
||||||
|
}
|
||||||
|
|
||||||
|
if d != len(dst) {
|
||||||
|
return decodeErrCodeCorrupt
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
}
|
1341
vendor/github.com/klauspost/compress/s2/encode.go
generated
vendored
Normal file
1341
vendor/github.com/klauspost/compress/s2/encode.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load diff
456
vendor/github.com/klauspost/compress/s2/encode_all.go
generated
vendored
Normal file
456
vendor/github.com/klauspost/compress/s2/encode_all.go
generated
vendored
Normal file
|
@ -0,0 +1,456 @@
|
||||||
|
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
|
||||||
|
// Copyright (c) 2019 Klaus Post. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package s2
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"encoding/binary"
|
||||||
|
"math/bits"
|
||||||
|
)
|
||||||
|
|
||||||
|
func load32(b []byte, i int) uint32 {
|
||||||
|
return binary.LittleEndian.Uint32(b[i:])
|
||||||
|
}
|
||||||
|
|
||||||
|
func load64(b []byte, i int) uint64 {
|
||||||
|
return binary.LittleEndian.Uint64(b[i:])
|
||||||
|
}
|
||||||
|
|
||||||
|
// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits.
|
||||||
|
// Preferably h should be a constant and should always be <64.
|
||||||
|
func hash6(u uint64, h uint8) uint32 {
|
||||||
|
const prime6bytes = 227718039650203
|
||||||
|
return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63))
|
||||||
|
}
|
||||||
|
|
||||||
|
func encodeGo(dst, src []byte) []byte {
|
||||||
|
if n := MaxEncodedLen(len(src)); n < 0 {
|
||||||
|
panic(ErrTooLarge)
|
||||||
|
} else if len(dst) < n {
|
||||||
|
dst = make([]byte, n)
|
||||||
|
}
|
||||||
|
|
||||||
|
// The block starts with the varint-encoded length of the decompressed bytes.
|
||||||
|
d := binary.PutUvarint(dst, uint64(len(src)))
|
||||||
|
|
||||||
|
if len(src) == 0 {
|
||||||
|
return dst[:d]
|
||||||
|
}
|
||||||
|
if len(src) < minNonLiteralBlockSize {
|
||||||
|
d += emitLiteral(dst[d:], src)
|
||||||
|
return dst[:d]
|
||||||
|
}
|
||||||
|
n := encodeBlockGo(dst[d:], src)
|
||||||
|
if n > 0 {
|
||||||
|
d += n
|
||||||
|
return dst[:d]
|
||||||
|
}
|
||||||
|
// Not compressible
|
||||||
|
d += emitLiteral(dst[d:], src)
|
||||||
|
return dst[:d]
|
||||||
|
}
|
||||||
|
|
||||||
|
// encodeBlockGo encodes a non-empty src to a guaranteed-large-enough dst. It
|
||||||
|
// assumes that the varint-encoded length of the decompressed bytes has already
|
||||||
|
// been written.
|
||||||
|
//
|
||||||
|
// It also assumes that:
|
||||||
|
// len(dst) >= MaxEncodedLen(len(src)) &&
|
||||||
|
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
|
||||||
|
func encodeBlockGo(dst, src []byte) (d int) {
|
||||||
|
// Initialize the hash table.
|
||||||
|
const (
|
||||||
|
tableBits = 14
|
||||||
|
maxTableSize = 1 << tableBits
|
||||||
|
|
||||||
|
debug = false
|
||||||
|
)
|
||||||
|
|
||||||
|
var table [maxTableSize]uint32
|
||||||
|
|
||||||
|
// sLimit is when to stop looking for offset/length copies. The inputMargin
|
||||||
|
// lets us use a fast path for emitLiteral in the main loop, while we are
|
||||||
|
// looking for copies.
|
||||||
|
sLimit := len(src) - inputMargin
|
||||||
|
|
||||||
|
// Bail if we can't compress to at least this.
|
||||||
|
dstLimit := len(src) - len(src)>>5 - 5
|
||||||
|
|
||||||
|
// nextEmit is where in src the next emitLiteral should start from.
|
||||||
|
nextEmit := 0
|
||||||
|
|
||||||
|
// The encoded form must start with a literal, as there are no previous
|
||||||
|
// bytes to copy, so we start looking for hash matches at s == 1.
|
||||||
|
s := 1
|
||||||
|
cv := load64(src, s)
|
||||||
|
|
||||||
|
// We search for a repeat at -1, but don't output repeats when nextEmit == 0
|
||||||
|
repeat := 1
|
||||||
|
|
||||||
|
for {
|
||||||
|
candidate := 0
|
||||||
|
for {
|
||||||
|
// Next src position to check
|
||||||
|
nextS := s + (s-nextEmit)>>6 + 4
|
||||||
|
if nextS > sLimit {
|
||||||
|
goto emitRemainder
|
||||||
|
}
|
||||||
|
hash0 := hash6(cv, tableBits)
|
||||||
|
hash1 := hash6(cv>>8, tableBits)
|
||||||
|
candidate = int(table[hash0])
|
||||||
|
candidate2 := int(table[hash1])
|
||||||
|
table[hash0] = uint32(s)
|
||||||
|
table[hash1] = uint32(s + 1)
|
||||||
|
hash2 := hash6(cv>>16, tableBits)
|
||||||
|
|
||||||
|
// Check repeat at offset checkRep.
|
||||||
|
const checkRep = 1
|
||||||
|
if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
|
||||||
|
base := s + checkRep
|
||||||
|
// Extend back
|
||||||
|
for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
|
||||||
|
i--
|
||||||
|
base--
|
||||||
|
}
|
||||||
|
d += emitLiteral(dst[d:], src[nextEmit:base])
|
||||||
|
|
||||||
|
// Extend forward
|
||||||
|
candidate := s - repeat + 4 + checkRep
|
||||||
|
s += 4 + checkRep
|
||||||
|
for s <= sLimit {
|
||||||
|
if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
|
||||||
|
s += bits.TrailingZeros64(diff) >> 3
|
||||||
|
break
|
||||||
|
}
|
||||||
|
s += 8
|
||||||
|
candidate += 8
|
||||||
|
}
|
||||||
|
if debug {
|
||||||
|
// Validate match.
|
||||||
|
if s <= candidate {
|
||||||
|
panic("s <= candidate")
|
||||||
|
}
|
||||||
|
a := src[base:s]
|
||||||
|
b := src[base-repeat : base-repeat+(s-base)]
|
||||||
|
if !bytes.Equal(a, b) {
|
||||||
|
panic("mismatch")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if nextEmit > 0 {
|
||||||
|
// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
|
||||||
|
d += emitRepeat(dst[d:], repeat, s-base)
|
||||||
|
} else {
|
||||||
|
// First match, cannot be repeat.
|
||||||
|
d += emitCopy(dst[d:], repeat, s-base)
|
||||||
|
}
|
||||||
|
nextEmit = s
|
||||||
|
if s >= sLimit {
|
||||||
|
goto emitRemainder
|
||||||
|
}
|
||||||
|
|
||||||
|
cv = load64(src, s)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if uint32(cv) == load32(src, candidate) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
candidate = int(table[hash2])
|
||||||
|
if uint32(cv>>8) == load32(src, candidate2) {
|
||||||
|
table[hash2] = uint32(s + 2)
|
||||||
|
candidate = candidate2
|
||||||
|
s++
|
||||||
|
break
|
||||||
|
}
|
||||||
|
table[hash2] = uint32(s + 2)
|
||||||
|
if uint32(cv>>16) == load32(src, candidate) {
|
||||||
|
s += 2
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
cv = load64(src, nextS)
|
||||||
|
s = nextS
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extend backwards.
|
||||||
|
// The top bytes will be rechecked to get the full match.
|
||||||
|
for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] {
|
||||||
|
candidate--
|
||||||
|
s--
|
||||||
|
}
|
||||||
|
|
||||||
|
// Bail if we exceed the maximum size.
|
||||||
|
if d+(s-nextEmit) > dstLimit {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// A 4-byte match has been found. We'll later see if more than 4 bytes
|
||||||
|
// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
|
||||||
|
// them as literal bytes.
|
||||||
|
|
||||||
|
d += emitLiteral(dst[d:], src[nextEmit:s])
|
||||||
|
|
||||||
|
// Call emitCopy, and then see if another emitCopy could be our next
|
||||||
|
// move. Repeat until we find no match for the input immediately after
|
||||||
|
// what was consumed by the last emitCopy call.
|
||||||
|
//
|
||||||
|
// If we exit this loop normally then we need to call emitLiteral next,
|
||||||
|
// though we don't yet know how big the literal will be. We handle that
|
||||||
|
// by proceeding to the next iteration of the main loop. We also can
|
||||||
|
// exit this loop via goto if we get close to exhausting the input.
|
||||||
|
for {
|
||||||
|
// Invariant: we have a 4-byte match at s, and no need to emit any
|
||||||
|
// literal bytes prior to s.
|
||||||
|
base := s
|
||||||
|
repeat = base - candidate
|
||||||
|
|
||||||
|
// Extend the 4-byte match as long as possible.
|
||||||
|
s += 4
|
||||||
|
candidate += 4
|
||||||
|
for s <= len(src)-8 {
|
||||||
|
if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
|
||||||
|
s += bits.TrailingZeros64(diff) >> 3
|
||||||
|
break
|
||||||
|
}
|
||||||
|
s += 8
|
||||||
|
candidate += 8
|
||||||
|
}
|
||||||
|
|
||||||
|
d += emitCopy(dst[d:], repeat, s-base)
|
||||||
|
if debug {
|
||||||
|
// Validate match.
|
||||||
|
if s <= candidate {
|
||||||
|
panic("s <= candidate")
|
||||||
|
}
|
||||||
|
a := src[base:s]
|
||||||
|
b := src[base-repeat : base-repeat+(s-base)]
|
||||||
|
if !bytes.Equal(a, b) {
|
||||||
|
panic("mismatch")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
nextEmit = s
|
||||||
|
if s >= sLimit {
|
||||||
|
goto emitRemainder
|
||||||
|
}
|
||||||
|
|
||||||
|
if d > dstLimit {
|
||||||
|
// Do we have space for more, if not bail.
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
// Check for an immediate match, otherwise start search at s+1
|
||||||
|
x := load64(src, s-2)
|
||||||
|
m2Hash := hash6(x, tableBits)
|
||||||
|
currHash := hash6(x>>16, tableBits)
|
||||||
|
candidate = int(table[currHash])
|
||||||
|
table[m2Hash] = uint32(s - 2)
|
||||||
|
table[currHash] = uint32(s)
|
||||||
|
if debug && s == candidate {
|
||||||
|
panic("s == candidate")
|
||||||
|
}
|
||||||
|
if uint32(x>>16) != load32(src, candidate) {
|
||||||
|
cv = load64(src, s+1)
|
||||||
|
s++
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
emitRemainder:
|
||||||
|
if nextEmit < len(src) {
|
||||||
|
// Bail if we exceed the maximum size.
|
||||||
|
if d+len(src)-nextEmit > dstLimit {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
d += emitLiteral(dst[d:], src[nextEmit:])
|
||||||
|
}
|
||||||
|
return d
|
||||||
|
}
|
||||||
|
|
||||||
|
func encodeBlockSnappyGo(dst, src []byte) (d int) {
	// Pure-Go Snappy-compatible block encoder: scans src with a 14-bit
	// hash table, emitting literals and copies into dst, and returns the
	// number of bytes written (0 means "incompressible, store raw").
	// NOTE(review): relies on inputMargin/load64/load32/hash6/emitLiteral/
	// emitCopyNoRepeat defined elsewhere in this package.

	// Initialize the hash table.
	const (
		tableBits    = 14
		maxTableSize = 1 << tableBits
	)

	// Maps hash6(8-byte window) -> last src position seen with that hash.
	var table [maxTableSize]uint32

	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := len(src) - inputMargin

	// Bail if we can't compress to at least this.
	// (Requires at least ~3% savings: len(src) - len(src)/32 - 5.)
	dstLimit := len(src) - len(src)>>5 - 5

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := 0

	// The encoded form must start with a literal, as there are no previous
	// bytes to copy, so we start looking for hash matches at s == 1.
	s := 1
	cv := load64(src, s)

	// We search for a repeat at -1, but don't output repeats when nextEmit == 0
	repeat := 1

	for {
		candidate := 0
		for {
			// Next src position to check.
			// Skip distance grows as the literal run grows, so incompressible
			// regions are scanned progressively faster.
			nextS := s + (s-nextEmit)>>6 + 4
			if nextS > sLimit {
				goto emitRemainder
			}
			// Probe two adjacent 6-byte windows per iteration.
			hash0 := hash6(cv, tableBits)
			hash1 := hash6(cv>>8, tableBits)
			candidate = int(table[hash0])
			candidate2 := int(table[hash1])
			table[hash0] = uint32(s)
			table[hash1] = uint32(s + 1)
			hash2 := hash6(cv>>16, tableBits)

			// Check repeat at offset checkRep.
			const checkRep = 1
			if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
				base := s + checkRep
				// Extend back
				for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
					i--
					base--
				}
				d += emitLiteral(dst[d:], src[nextEmit:base])

				// Extend forward
				candidate := s - repeat + 4 + checkRep
				s += 4 + checkRep
				for s <= sLimit {
					if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
						// First differing byte index via trailing zero count.
						s += bits.TrailingZeros64(diff) >> 3
						break
					}
					s += 8
					candidate += 8
				}

				d += emitCopyNoRepeat(dst[d:], repeat, s-base)
				nextEmit = s
				if s >= sLimit {
					goto emitRemainder
				}

				cv = load64(src, s)
				continue
			}

			if uint32(cv) == load32(src, candidate) {
				break
			}
			candidate = int(table[hash2])
			if uint32(cv>>8) == load32(src, candidate2) {
				table[hash2] = uint32(s + 2)
				candidate = candidate2
				s++
				break
			}
			table[hash2] = uint32(s + 2)
			if uint32(cv>>16) == load32(src, candidate) {
				s += 2
				break
			}

			cv = load64(src, nextS)
			s = nextS
		}

		// Extend backwards
		for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] {
			candidate--
			s--
		}

		// Bail if we exceed the maximum size.
		if d+(s-nextEmit) > dstLimit {
			return 0
		}

		// A 4-byte match has been found. We'll later see if more than 4 bytes
		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
		// them as literal bytes.

		d += emitLiteral(dst[d:], src[nextEmit:s])

		// Call emitCopy, and then see if another emitCopy could be our next
		// move. Repeat until we find no match for the input immediately after
		// what was consumed by the last emitCopy call.
		//
		// If we exit this loop normally then we need to call emitLiteral next,
		// though we don't yet know how big the literal will be. We handle that
		// by proceeding to the next iteration of the main loop. We also can
		// exit this loop via goto if we get close to exhausting the input.
		for {
			// Invariant: we have a 4-byte match at s, and no need to emit any
			// literal bytes prior to s.
			base := s
			repeat = base - candidate

			// Extend the 4-byte match as long as possible.
			s += 4
			candidate += 4
			for s <= len(src)-8 {
				if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
					s += bits.TrailingZeros64(diff) >> 3
					break
				}
				s += 8
				candidate += 8
			}

			d += emitCopyNoRepeat(dst[d:], repeat, s-base)
			if false {
				// Validate match. (Debug-only; compiled out.)
				a := src[base:s]
				b := src[base-repeat : base-repeat+(s-base)]
				if !bytes.Equal(a, b) {
					panic("mismatch")
				}
			}

			nextEmit = s
			if s >= sLimit {
				goto emitRemainder
			}

			if d > dstLimit {
				// Do we have space for more, if not bail.
				return 0
			}
			// Check for an immediate match, otherwise start search at s+1
			x := load64(src, s-2)
			m2Hash := hash6(x, tableBits)
			currHash := hash6(x>>16, tableBits)
			candidate = int(table[currHash])
			table[m2Hash] = uint32(s - 2)
			table[currHash] = uint32(s)
			if uint32(x>>16) != load32(src, candidate) {
				cv = load64(src, s+1)
				s++
				break
			}
		}
	}

emitRemainder:
	if nextEmit < len(src) {
		// Bail if we exceed the maximum size.
		if d+len(src)-nextEmit > dstLimit {
			return 0
		}
		d += emitLiteral(dst[d:], src[nextEmit:])
	}
	return d
}
|
142
vendor/github.com/klauspost/compress/s2/encode_amd64.go
generated
vendored
Normal file
142
vendor/github.com/klauspost/compress/s2/encode_amd64.go
generated
vendored
Normal file
|
@ -0,0 +1,142 @@
|
||||||
|
//go:build !appengine && !noasm && gc
|
||||||
|
// +build !appengine,!noasm,gc
|
||||||
|
|
||||||
|
package s2
|
||||||
|
|
||||||
|
// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
|
||||||
|
// assumes that the varint-encoded length of the decompressed bytes has already
|
||||||
|
// been written.
|
||||||
|
//
|
||||||
|
// It also assumes that:
|
||||||
|
// len(dst) >= MaxEncodedLen(len(src)) &&
|
||||||
|
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
|
||||||
|
func encodeBlock(dst, src []byte) (d int) {
|
||||||
|
const (
|
||||||
|
// Use 12 bit table when less than...
|
||||||
|
limit12B = 16 << 10
|
||||||
|
// Use 10 bit table when less than...
|
||||||
|
limit10B = 4 << 10
|
||||||
|
// Use 8 bit table when less than...
|
||||||
|
limit8B = 512
|
||||||
|
)
|
||||||
|
|
||||||
|
if len(src) >= 4<<20 {
|
||||||
|
return encodeBlockAsm(dst, src)
|
||||||
|
}
|
||||||
|
if len(src) >= limit12B {
|
||||||
|
return encodeBlockAsm4MB(dst, src)
|
||||||
|
}
|
||||||
|
if len(src) >= limit10B {
|
||||||
|
return encodeBlockAsm12B(dst, src)
|
||||||
|
}
|
||||||
|
if len(src) >= limit8B {
|
||||||
|
return encodeBlockAsm10B(dst, src)
|
||||||
|
}
|
||||||
|
if len(src) < minNonLiteralBlockSize {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return encodeBlockAsm8B(dst, src)
|
||||||
|
}
|
||||||
|
|
||||||
|
// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It
|
||||||
|
// assumes that the varint-encoded length of the decompressed bytes has already
|
||||||
|
// been written.
|
||||||
|
//
|
||||||
|
// It also assumes that:
|
||||||
|
// len(dst) >= MaxEncodedLen(len(src)) &&
|
||||||
|
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
|
||||||
|
func encodeBlockBetter(dst, src []byte) (d int) {
|
||||||
|
const (
|
||||||
|
// Use 12 bit table when less than...
|
||||||
|
limit12B = 16 << 10
|
||||||
|
// Use 10 bit table when less than...
|
||||||
|
limit10B = 4 << 10
|
||||||
|
// Use 8 bit table when less than...
|
||||||
|
limit8B = 512
|
||||||
|
)
|
||||||
|
|
||||||
|
if len(src) > 4<<20 {
|
||||||
|
return encodeBetterBlockAsm(dst, src)
|
||||||
|
}
|
||||||
|
if len(src) >= limit12B {
|
||||||
|
return encodeBetterBlockAsm4MB(dst, src)
|
||||||
|
}
|
||||||
|
if len(src) >= limit10B {
|
||||||
|
return encodeBetterBlockAsm12B(dst, src)
|
||||||
|
}
|
||||||
|
if len(src) >= limit8B {
|
||||||
|
return encodeBetterBlockAsm10B(dst, src)
|
||||||
|
}
|
||||||
|
if len(src) < minNonLiteralBlockSize {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return encodeBetterBlockAsm8B(dst, src)
|
||||||
|
}
|
||||||
|
|
||||||
|
// encodeBlockSnappy encodes a non-empty src to a guaranteed-large-enough dst. It
|
||||||
|
// assumes that the varint-encoded length of the decompressed bytes has already
|
||||||
|
// been written.
|
||||||
|
//
|
||||||
|
// It also assumes that:
|
||||||
|
// len(dst) >= MaxEncodedLen(len(src)) &&
|
||||||
|
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
|
||||||
|
func encodeBlockSnappy(dst, src []byte) (d int) {
|
||||||
|
const (
|
||||||
|
// Use 12 bit table when less than...
|
||||||
|
limit12B = 16 << 10
|
||||||
|
// Use 10 bit table when less than...
|
||||||
|
limit10B = 4 << 10
|
||||||
|
// Use 8 bit table when less than...
|
||||||
|
limit8B = 512
|
||||||
|
)
|
||||||
|
if len(src) >= 64<<10 {
|
||||||
|
return encodeSnappyBlockAsm(dst, src)
|
||||||
|
}
|
||||||
|
if len(src) >= limit12B {
|
||||||
|
return encodeSnappyBlockAsm64K(dst, src)
|
||||||
|
}
|
||||||
|
if len(src) >= limit10B {
|
||||||
|
return encodeSnappyBlockAsm12B(dst, src)
|
||||||
|
}
|
||||||
|
if len(src) >= limit8B {
|
||||||
|
return encodeSnappyBlockAsm10B(dst, src)
|
||||||
|
}
|
||||||
|
if len(src) < minNonLiteralBlockSize {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return encodeSnappyBlockAsm8B(dst, src)
|
||||||
|
}
|
||||||
|
|
||||||
|
// encodeBlockBetterSnappy encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
//	len(dst) >= MaxEncodedLen(len(src)) &&
//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetterSnappy(dst, src []byte) (d int) {
	const (
		// Use 12 bit table when less than...
		limit12B = 16 << 10
		// Use 10 bit table when less than...
		limit10B = 4 << 10
		// Use 8 bit table when less than...
		limit8B = 512
	)
	// Dispatch to the Snappy-compatible "better" assembler kernel whose
	// hash-table size matches the input length.
	if len(src) >= 64<<10 {
		return encodeSnappyBetterBlockAsm(dst, src)
	}
	if len(src) >= limit12B {
		return encodeSnappyBetterBlockAsm64K(dst, src)
	}
	if len(src) >= limit10B {
		return encodeSnappyBetterBlockAsm12B(dst, src)
	}
	if len(src) >= limit8B {
		return encodeSnappyBetterBlockAsm10B(dst, src)
	}
	if len(src) < minNonLiteralBlockSize {
		// Too small to hold any non-literal encoding; caller stores it raw.
		return 0
	}
	return encodeSnappyBetterBlockAsm8B(dst, src)
}
|
630
vendor/github.com/klauspost/compress/s2/encode_best.go
generated
vendored
Normal file
630
vendor/github.com/klauspost/compress/s2/encode_best.go
generated
vendored
Normal file
|
@ -0,0 +1,630 @@
|
||||||
|
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
|
||||||
|
// Copyright (c) 2019 Klaus Post. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package s2
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"math/bits"
|
||||||
|
)
|
||||||
|
|
||||||
|
// encodeBlockBest encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
//	len(dst) >= MaxEncodedLen(len(src)) &&
//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBest(dst, src []byte) (d int) {
	// Initialize the hash tables.
	const (
		// Long hash matches.
		lTableBits    = 19
		maxLTableSize = 1 << lTableBits

		// Short hash matches.
		sTableBits    = 16
		maxSTableSize = 1 << sTableBits

		inputMargin = 8 + 2
	)

	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := len(src) - inputMargin
	if len(src) < minNonLiteralBlockSize {
		return 0
	}

	// Each table entry packs two candidate positions: current in the low
	// 32 bits, previous in the high 32 bits (see getCur/getPrev).
	var lTable [maxLTableSize]uint64
	var sTable [maxSTableSize]uint64

	// Bail if we can't compress to at least this.
	dstLimit := len(src) - 5

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := 0

	// The encoded form must start with a literal, as there are no previous
	// bytes to copy, so we start looking for hash matches at s == 1.
	s := 1
	cv := load64(src, s)

	// We search for a repeat at -1, but don't output repeats when nextEmit == 0
	repeat := 1
	const lowbitMask = 0xffffffff
	getCur := func(x uint64) int {
		return int(x & lowbitMask)
	}
	getPrev := func(x uint64) int {
		return int(x >> 32)
	}
	const maxSkip = 64

	for {
		type match struct {
			offset int
			s      int
			length int
			score  int
			rep    bool
		}
		var best match
		for {
			// Next src position to check.
			// Skip distance grows with the current literal run, capped at maxSkip.
			nextS := (s-nextEmit)>>8 + 1
			if nextS > maxSkip {
				nextS = s + maxSkip
			} else {
				nextS += s
			}
			if nextS > sLimit {
				goto emitRemainder
			}
			hashL := hash8(cv, lTableBits)
			hashS := hash4(cv, sTableBits)
			candidateL := lTable[hashL]
			candidateS := sTable[hashS]

			score := func(m match) int {
				// Matches that are longer forward are penalized since we must emit it as a literal.
				score := m.length - m.s
				if nextEmit == m.s {
					// If we do not have to emit literals, we save 1 byte
					score++
				}
				offset := m.s - m.offset
				if m.rep {
					return score - emitRepeatSize(offset, m.length)
				}
				return score - emitCopySize(offset, m.length)
			}

			matchAt := func(offset, s int, first uint32, rep bool) match {
				if best.length != 0 && best.s-best.offset == s-offset {
					// Don't retest if we have the same offset.
					return match{offset: offset, s: s}
				}
				if load32(src, offset) != first {
					return match{offset: offset, s: s}
				}
				// m.length temporarily holds the absolute end position
				// (offset + matched bytes); it becomes a length below.
				m := match{offset: offset, s: s, length: 4 + offset, rep: rep}
				s += 4
				for s <= sLimit {
					if diff := load64(src, s) ^ load64(src, m.length); diff != 0 {
						m.length += bits.TrailingZeros64(diff) >> 3
						break
					}
					s += 8
					m.length += 8
				}
				m.length -= offset
				m.score = score(m)
				if m.score <= -m.s {
					// Eliminate if no savings, we might find a better one.
					m.length = 0
				}
				return m
			}

			bestOf := func(a, b match) match {
				if b.length == 0 {
					return a
				}
				if a.length == 0 {
					return b
				}
				as := a.score + b.s
				bs := b.score + a.s
				if as >= bs {
					return a
				}
				return b
			}

			best = bestOf(matchAt(getCur(candidateL), s, uint32(cv), false), matchAt(getPrev(candidateL), s, uint32(cv), false))
			best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv), false))
			best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv), false))

			{
				best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true))
				if best.length > 0 {
					// s+1
					nextShort := sTable[hash4(cv>>8, sTableBits)]
					s := s + 1
					cv := load64(src, s)
					nextLong := lTable[hash8(cv, lTableBits)]
					best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv), false))
					best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv), false))
					best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false))
					best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false))
					// Repeat at + 2
					best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true))

					// s+2
					if true {
						nextShort = sTable[hash4(cv>>8, sTableBits)]
						s++
						cv = load64(src, s)
						nextLong = lTable[hash8(cv, lTableBits)]
						best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv), false))
						best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv), false))
						best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false))
						best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false))
					}
					// Search for a match at best match end, see if that is better.
					if sAt := best.s + best.length; sAt < sLimit {
						sBack := best.s
						backL := best.length
						// Load initial values
						cv = load64(src, sBack)
						// Search for mismatch
						next := lTable[hash8(load64(src, sAt), lTableBits)]
						//next := sTable[hash4(load64(src, sAt), sTableBits)]

						if checkAt := getCur(next) - backL; checkAt > 0 {
							best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
						}
						if checkAt := getPrev(next) - backL; checkAt > 0 {
							best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
						}
					}
				}
			}

			// Update table
			lTable[hashL] = uint64(s) | candidateL<<32
			sTable[hashS] = uint64(s) | candidateS<<32

			if best.length > 0 {
				break
			}

			cv = load64(src, nextS)
			s = nextS
		}

		// Extend backwards, not needed for repeats...
		s = best.s
		if !best.rep {
			for best.offset > 0 && s > nextEmit && src[best.offset-1] == src[s-1] {
				best.offset--
				best.length++
				s--
			}
		}
		if false && best.offset >= s {
			// Debug-only sanity check; compiled out.
			panic(fmt.Errorf("t %d >= s %d", best.offset, s))
		}
		// Bail if we exceed the maximum size.
		if d+(s-nextEmit) > dstLimit {
			return 0
		}

		base := s
		offset := s - best.offset

		s += best.length

		if offset > 65535 && s-base <= 5 && !best.rep {
			// Bail if the match is equal or worse to the encoding.
			s = best.s + 1
			if s >= sLimit {
				goto emitRemainder
			}
			cv = load64(src, s)
			continue
		}
		d += emitLiteral(dst[d:], src[nextEmit:base])
		if best.rep {
			if nextEmit > 0 {
				// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
				d += emitRepeat(dst[d:], offset, best.length)
			} else {
				// First match, cannot be repeat.
				d += emitCopy(dst[d:], offset, best.length)
			}
		} else {
			d += emitCopy(dst[d:], offset, best.length)
		}
		repeat = offset

		nextEmit = s
		if s >= sLimit {
			goto emitRemainder
		}

		if d > dstLimit {
			// Do we have space for more, if not bail.
			return 0
		}
		// Fill tables...
		for i := best.s + 1; i < s; i++ {
			cv0 := load64(src, i)
			long0 := hash8(cv0, lTableBits)
			short0 := hash4(cv0, sTableBits)
			lTable[long0] = uint64(i) | lTable[long0]<<32
			sTable[short0] = uint64(i) | sTable[short0]<<32
		}
		cv = load64(src, s)
	}

emitRemainder:
	if nextEmit < len(src) {
		// Bail if we exceed the maximum size.
		if d+len(src)-nextEmit > dstLimit {
			return 0
		}
		d += emitLiteral(dst[d:], src[nextEmit:])
	}
	return d
}
|
||||||
|
|
||||||
|
// encodeBlockBestSnappy encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
//	len(dst) >= MaxEncodedLen(len(src)) &&
//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBestSnappy(dst, src []byte) (d int) {
	// Snappy-compatible variant of encodeBlockBest: identical search
	// strategy, but scores and emits only plain copies (no repeat codes),
	// since the Snappy format has no repeat opcode.

	// Initialize the hash tables.
	const (
		// Long hash matches.
		lTableBits    = 19
		maxLTableSize = 1 << lTableBits

		// Short hash matches.
		sTableBits    = 16
		maxSTableSize = 1 << sTableBits

		inputMargin = 8 + 2
	)

	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := len(src) - inputMargin
	if len(src) < minNonLiteralBlockSize {
		return 0
	}

	// Each table entry packs two candidate positions: current in the low
	// 32 bits, previous in the high 32 bits (see getCur/getPrev).
	var lTable [maxLTableSize]uint64
	var sTable [maxSTableSize]uint64

	// Bail if we can't compress to at least this.
	dstLimit := len(src) - 5

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := 0

	// The encoded form must start with a literal, as there are no previous
	// bytes to copy, so we start looking for hash matches at s == 1.
	s := 1
	cv := load64(src, s)

	// We search for a repeat at -1, but don't output repeats when nextEmit == 0
	repeat := 1
	const lowbitMask = 0xffffffff
	getCur := func(x uint64) int {
		return int(x & lowbitMask)
	}
	getPrev := func(x uint64) int {
		return int(x >> 32)
	}
	const maxSkip = 64

	for {
		type match struct {
			offset int
			s      int
			length int
			score  int
		}
		var best match
		for {
			// Next src position to check.
			// Skip distance grows with the current literal run, capped at maxSkip.
			nextS := (s-nextEmit)>>8 + 1
			if nextS > maxSkip {
				nextS = s + maxSkip
			} else {
				nextS += s
			}
			if nextS > sLimit {
				goto emitRemainder
			}
			hashL := hash8(cv, lTableBits)
			hashS := hash4(cv, sTableBits)
			candidateL := lTable[hashL]
			candidateS := sTable[hashS]

			score := func(m match) int {
				// Matches that are longer forward are penalized since we must emit it as a literal.
				score := m.length - m.s
				if nextEmit == m.s {
					// If we do not have to emit literals, we save 1 byte
					score++
				}
				offset := m.s - m.offset

				return score - emitCopyNoRepeatSize(offset, m.length)
			}

			matchAt := func(offset, s int, first uint32) match {
				if best.length != 0 && best.s-best.offset == s-offset {
					// Don't retest if we have the same offset.
					return match{offset: offset, s: s}
				}
				if load32(src, offset) != first {
					return match{offset: offset, s: s}
				}
				// m.length temporarily holds the absolute end position
				// (offset + matched bytes); it becomes a length below.
				m := match{offset: offset, s: s, length: 4 + offset}
				s += 4
				for s <= sLimit {
					if diff := load64(src, s) ^ load64(src, m.length); diff != 0 {
						m.length += bits.TrailingZeros64(diff) >> 3
						break
					}
					s += 8
					m.length += 8
				}
				m.length -= offset
				m.score = score(m)
				if m.score <= -m.s {
					// Eliminate if no savings, we might find a better one.
					m.length = 0
				}
				return m
			}

			bestOf := func(a, b match) match {
				if b.length == 0 {
					return a
				}
				if a.length == 0 {
					return b
				}
				as := a.score + b.s
				bs := b.score + a.s
				if as >= bs {
					return a
				}
				return b
			}

			best = bestOf(matchAt(getCur(candidateL), s, uint32(cv)), matchAt(getPrev(candidateL), s, uint32(cv)))
			best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv)))
			best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv)))

			{
				best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8)))
				if best.length > 0 {
					// s+1
					nextShort := sTable[hash4(cv>>8, sTableBits)]
					s := s + 1
					cv := load64(src, s)
					nextLong := lTable[hash8(cv, lTableBits)]
					best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv)))
					best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv)))
					best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv)))
					best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv)))
					// Repeat at + 2
					best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8)))

					// s+2
					if true {
						nextShort = sTable[hash4(cv>>8, sTableBits)]
						s++
						cv = load64(src, s)
						nextLong = lTable[hash8(cv, lTableBits)]
						best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv)))
						best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv)))
						best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv)))
						best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv)))
					}
					// Search for a match at best match end, see if that is better.
					if sAt := best.s + best.length; sAt < sLimit {
						sBack := best.s
						backL := best.length
						// Load initial values
						cv = load64(src, sBack)
						// Search for mismatch
						next := lTable[hash8(load64(src, sAt), lTableBits)]
						//next := sTable[hash4(load64(src, sAt), sTableBits)]

						if checkAt := getCur(next) - backL; checkAt > 0 {
							best = bestOf(best, matchAt(checkAt, sBack, uint32(cv)))
						}
						if checkAt := getPrev(next) - backL; checkAt > 0 {
							best = bestOf(best, matchAt(checkAt, sBack, uint32(cv)))
						}
					}
				}
			}

			// Update table
			lTable[hashL] = uint64(s) | candidateL<<32
			sTable[hashS] = uint64(s) | candidateS<<32

			if best.length > 0 {
				break
			}

			cv = load64(src, nextS)
			s = nextS
		}

		// Extend backwards, not needed for repeats...
		s = best.s
		if true {
			for best.offset > 0 && s > nextEmit && src[best.offset-1] == src[s-1] {
				best.offset--
				best.length++
				s--
			}
		}
		if false && best.offset >= s {
			// Debug-only sanity check; compiled out.
			panic(fmt.Errorf("t %d >= s %d", best.offset, s))
		}
		// Bail if we exceed the maximum size.
		if d+(s-nextEmit) > dstLimit {
			return 0
		}

		base := s
		offset := s - best.offset

		s += best.length

		if offset > 65535 && s-base <= 5 {
			// Bail if the match is equal or worse to the encoding.
			s = best.s + 1
			if s >= sLimit {
				goto emitRemainder
			}
			cv = load64(src, s)
			continue
		}
		d += emitLiteral(dst[d:], src[nextEmit:base])
		d += emitCopyNoRepeat(dst[d:], offset, best.length)
		repeat = offset

		nextEmit = s
		if s >= sLimit {
			goto emitRemainder
		}

		if d > dstLimit {
			// Do we have space for more, if not bail.
			return 0
		}
		// Fill tables...
		for i := best.s + 1; i < s; i++ {
			cv0 := load64(src, i)
			long0 := hash8(cv0, lTableBits)
			short0 := hash4(cv0, sTableBits)
			lTable[long0] = uint64(i) | lTable[long0]<<32
			sTable[short0] = uint64(i) | sTable[short0]<<32
		}
		cv = load64(src, s)
	}

emitRemainder:
	if nextEmit < len(src) {
		// Bail if we exceed the maximum size.
		if d+len(src)-nextEmit > dstLimit {
			return 0
		}
		d += emitLiteral(dst[d:], src[nextEmit:])
	}
	return d
}
|
||||||
|
|
||||||
|
// emitCopySize returns the size to encode the offset+length
|
||||||
|
//
|
||||||
|
// It assumes that:
|
||||||
|
// 1 <= offset && offset <= math.MaxUint32
|
||||||
|
// 4 <= length && length <= 1 << 24
|
||||||
|
func emitCopySize(offset, length int) int {
|
||||||
|
if offset >= 65536 {
|
||||||
|
i := 0
|
||||||
|
if length > 64 {
|
||||||
|
length -= 64
|
||||||
|
if length >= 4 {
|
||||||
|
// Emit remaining as repeats
|
||||||
|
return 5 + emitRepeatSize(offset, length)
|
||||||
|
}
|
||||||
|
i = 5
|
||||||
|
}
|
||||||
|
if length == 0 {
|
||||||
|
return i
|
||||||
|
}
|
||||||
|
return i + 5
|
||||||
|
}
|
||||||
|
|
||||||
|
// Offset no more than 2 bytes.
|
||||||
|
if length > 64 {
|
||||||
|
if offset < 2048 {
|
||||||
|
// Emit 8 bytes, then rest as repeats...
|
||||||
|
return 2 + emitRepeatSize(offset, length-8)
|
||||||
|
}
|
||||||
|
// Emit remaining as repeats, at least 4 bytes remain.
|
||||||
|
return 3 + emitRepeatSize(offset, length-60)
|
||||||
|
}
|
||||||
|
if length >= 12 || offset >= 2048 {
|
||||||
|
return 3
|
||||||
|
}
|
||||||
|
// Emit the remaining copy, encoded as 2 bytes.
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
// emitCopyNoRepeatSize returns the size to encode the offset+length
//
// It assumes that:
//	1 <= offset && offset <= math.MaxUint32
//	4 <= length && length <= 1 << 24
func emitCopyNoRepeatSize(offset, length int) int {
	switch {
	case offset >= 65536:
		// 5-byte copy4 opcodes; one extra opcode per additional 64 bytes.
		return 5 + 5*(length/64)
	case length > 64:
		// 3-byte copy2 opcodes; one extra opcode per additional 60 bytes.
		return 3 + 3*(length/60)
	case length >= 12 || offset >= 2048:
		// Single 3-byte copy.
		return 3
	default:
		// Single 2-byte copy.
		return 2
	}
}
|
||||||
|
|
||||||
|
// emitRepeatSize returns the number of bytes required to encode a repeat.
// Length must be at least 4 and < 1<<24
func emitRepeatSize(offset, length int) int {
	// Small repeats fit the 2/3/4-byte forms, with an extra break for
	// short lengths at small offsets.
	switch {
	case length <= 4+4, length < 8+4 && offset < 2048:
		return 2
	case length < (1<<8)+4+4:
		return 3
	case length < (1<<16)+(1<<8)+4:
		return 4
	}
	const maxRepeat = (1 << 24) - 1
	// The 5-byte form covers up to maxRepeat; any excess is chained as a
	// further repeat.
	remaining := length - ((1 << 16) - 4)
	if remaining > maxRepeat {
		overflow := remaining - maxRepeat + 4
		return 5 + emitRepeatSize(offset, overflow)
	}
	return 5
}
|
431
vendor/github.com/klauspost/compress/s2/encode_better.go
generated
vendored
Normal file
431
vendor/github.com/klauspost/compress/s2/encode_better.go
generated
vendored
Normal file
|
@ -0,0 +1,431 @@
|
||||||
|
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
|
||||||
|
// Copyright (c) 2019 Klaus Post. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package s2
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math/bits"
|
||||||
|
)
|
||||||
|
|
||||||
|
// hash4 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
|
||||||
|
// Preferably h should be a constant and should always be <32.
|
||||||
|
func hash4(u uint64, h uint8) uint32 {
|
||||||
|
const prime4bytes = 2654435761
|
||||||
|
return (uint32(u) * prime4bytes) >> ((32 - h) & 31)
|
||||||
|
}
|
||||||
|
|
||||||
|
// hash5 returns the hash of the lowest 5 bytes of u to fit in a hash table with h bits.
|
||||||
|
// Preferably h should be a constant and should always be <64.
|
||||||
|
func hash5(u uint64, h uint8) uint32 {
|
||||||
|
const prime5bytes = 889523592379
|
||||||
|
return uint32(((u << (64 - 40)) * prime5bytes) >> ((64 - h) & 63))
|
||||||
|
}
|
||||||
|
|
||||||
|
// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
|
||||||
|
// Preferably h should be a constant and should always be <64.
|
||||||
|
func hash7(u uint64, h uint8) uint32 {
|
||||||
|
const prime7bytes = 58295818150454627
|
||||||
|
return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
|
||||||
|
}
|
||||||
|
|
||||||
|
// hash8 returns the hash of u to fit in a hash table with h bits.
|
||||||
|
// Preferably h should be a constant and should always be <64.
|
||||||
|
func hash8(u uint64, h uint8) uint32 {
|
||||||
|
const prime8bytes = 0xcf1bbcdcb7a56463
|
||||||
|
return uint32((u * prime8bytes) >> ((64 - h) & 63))
|
||||||
|
}
|
||||||
|
|
||||||
|
// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It
|
||||||
|
// assumes that the varint-encoded length of the decompressed bytes has already
|
||||||
|
// been written.
|
||||||
|
//
|
||||||
|
// It also assumes that:
|
||||||
|
// len(dst) >= MaxEncodedLen(len(src)) &&
|
||||||
|
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
|
||||||
|
func encodeBlockBetterGo(dst, src []byte) (d int) {
|
||||||
|
// sLimit is when to stop looking for offset/length copies. The inputMargin
|
||||||
|
// lets us use a fast path for emitLiteral in the main loop, while we are
|
||||||
|
// looking for copies.
|
||||||
|
sLimit := len(src) - inputMargin
|
||||||
|
if len(src) < minNonLiteralBlockSize {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialize the hash tables.
|
||||||
|
const (
|
||||||
|
// Long hash matches.
|
||||||
|
lTableBits = 16
|
||||||
|
maxLTableSize = 1 << lTableBits
|
||||||
|
|
||||||
|
// Short hash matches.
|
||||||
|
sTableBits = 14
|
||||||
|
maxSTableSize = 1 << sTableBits
|
||||||
|
)
|
||||||
|
|
||||||
|
var lTable [maxLTableSize]uint32
|
||||||
|
var sTable [maxSTableSize]uint32
|
||||||
|
|
||||||
|
// Bail if we can't compress to at least this.
|
||||||
|
dstLimit := len(src) - len(src)>>5 - 6
|
||||||
|
|
||||||
|
// nextEmit is where in src the next emitLiteral should start from.
|
||||||
|
nextEmit := 0
|
||||||
|
|
||||||
|
// The encoded form must start with a literal, as there are no previous
|
||||||
|
// bytes to copy, so we start looking for hash matches at s == 1.
|
||||||
|
s := 1
|
||||||
|
cv := load64(src, s)
|
||||||
|
|
||||||
|
// We initialize repeat to 0, so we never match on first attempt
|
||||||
|
repeat := 0
|
||||||
|
|
||||||
|
for {
|
||||||
|
candidateL := 0
|
||||||
|
nextS := 0
|
||||||
|
for {
|
||||||
|
// Next src position to check
|
||||||
|
nextS = s + (s-nextEmit)>>7 + 1
|
||||||
|
if nextS > sLimit {
|
||||||
|
goto emitRemainder
|
||||||
|
}
|
||||||
|
hashL := hash7(cv, lTableBits)
|
||||||
|
hashS := hash4(cv, sTableBits)
|
||||||
|
candidateL = int(lTable[hashL])
|
||||||
|
candidateS := int(sTable[hashS])
|
||||||
|
lTable[hashL] = uint32(s)
|
||||||
|
sTable[hashS] = uint32(s)
|
||||||
|
|
||||||
|
// Check repeat at offset checkRep.
|
||||||
|
const checkRep = 1
|
||||||
|
if false && uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
|
||||||
|
base := s + checkRep
|
||||||
|
// Extend back
|
||||||
|
for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
|
||||||
|
i--
|
||||||
|
base--
|
||||||
|
}
|
||||||
|
d += emitLiteral(dst[d:], src[nextEmit:base])
|
||||||
|
|
||||||
|
// Extend forward
|
||||||
|
candidate := s - repeat + 4 + checkRep
|
||||||
|
s += 4 + checkRep
|
||||||
|
for s < len(src) {
|
||||||
|
if len(src)-s < 8 {
|
||||||
|
if src[s] == src[candidate] {
|
||||||
|
s++
|
||||||
|
candidate++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
|
||||||
|
s += bits.TrailingZeros64(diff) >> 3
|
||||||
|
break
|
||||||
|
}
|
||||||
|
s += 8
|
||||||
|
candidate += 8
|
||||||
|
}
|
||||||
|
if nextEmit > 0 {
|
||||||
|
// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
|
||||||
|
d += emitRepeat(dst[d:], repeat, s-base)
|
||||||
|
} else {
|
||||||
|
// First match, cannot be repeat.
|
||||||
|
d += emitCopy(dst[d:], repeat, s-base)
|
||||||
|
}
|
||||||
|
nextEmit = s
|
||||||
|
if s >= sLimit {
|
||||||
|
goto emitRemainder
|
||||||
|
}
|
||||||
|
|
||||||
|
cv = load64(src, s)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if uint32(cv) == load32(src, candidateL) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check our short candidate
|
||||||
|
if uint32(cv) == load32(src, candidateS) {
|
||||||
|
// Try a long candidate at s+1
|
||||||
|
hashL = hash7(cv>>8, lTableBits)
|
||||||
|
candidateL = int(lTable[hashL])
|
||||||
|
lTable[hashL] = uint32(s + 1)
|
||||||
|
if uint32(cv>>8) == load32(src, candidateL) {
|
||||||
|
s++
|
||||||
|
break
|
||||||
|
}
|
||||||
|
// Use our short candidate.
|
||||||
|
candidateL = candidateS
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
cv = load64(src, nextS)
|
||||||
|
s = nextS
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extend backwards
|
||||||
|
for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
|
||||||
|
candidateL--
|
||||||
|
s--
|
||||||
|
}
|
||||||
|
|
||||||
|
// Bail if we exceed the maximum size.
|
||||||
|
if d+(s-nextEmit) > dstLimit {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
base := s
|
||||||
|
offset := base - candidateL
|
||||||
|
|
||||||
|
// Extend the 4-byte match as long as possible.
|
||||||
|
s += 4
|
||||||
|
candidateL += 4
|
||||||
|
for s < len(src) {
|
||||||
|
if len(src)-s < 8 {
|
||||||
|
if src[s] == src[candidateL] {
|
||||||
|
s++
|
||||||
|
candidateL++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
|
||||||
|
s += bits.TrailingZeros64(diff) >> 3
|
||||||
|
break
|
||||||
|
}
|
||||||
|
s += 8
|
||||||
|
candidateL += 8
|
||||||
|
}
|
||||||
|
|
||||||
|
if offset > 65535 && s-base <= 5 && repeat != offset {
|
||||||
|
// Bail if the match is equal or worse to the encoding.
|
||||||
|
s = nextS + 1
|
||||||
|
if s >= sLimit {
|
||||||
|
goto emitRemainder
|
||||||
|
}
|
||||||
|
cv = load64(src, s)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
d += emitLiteral(dst[d:], src[nextEmit:base])
|
||||||
|
if repeat == offset {
|
||||||
|
d += emitRepeat(dst[d:], offset, s-base)
|
||||||
|
} else {
|
||||||
|
d += emitCopy(dst[d:], offset, s-base)
|
||||||
|
repeat = offset
|
||||||
|
}
|
||||||
|
|
||||||
|
nextEmit = s
|
||||||
|
if s >= sLimit {
|
||||||
|
goto emitRemainder
|
||||||
|
}
|
||||||
|
|
||||||
|
if d > dstLimit {
|
||||||
|
// Do we have space for more, if not bail.
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
// Index match start+1 (long) and start+2 (short)
|
||||||
|
index0 := base + 1
|
||||||
|
// Index match end-2 (long) and end-1 (short)
|
||||||
|
index1 := s - 2
|
||||||
|
|
||||||
|
cv0 := load64(src, index0)
|
||||||
|
cv1 := load64(src, index1)
|
||||||
|
cv = load64(src, s)
|
||||||
|
lTable[hash7(cv0, lTableBits)] = uint32(index0)
|
||||||
|
lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1)
|
||||||
|
lTable[hash7(cv1, lTableBits)] = uint32(index1)
|
||||||
|
lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)
|
||||||
|
sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
|
||||||
|
sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2)
|
||||||
|
sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
|
||||||
|
}
|
||||||
|
|
||||||
|
emitRemainder:
|
||||||
|
if nextEmit < len(src) {
|
||||||
|
// Bail if we exceed the maximum size.
|
||||||
|
if d+len(src)-nextEmit > dstLimit {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
d += emitLiteral(dst[d:], src[nextEmit:])
|
||||||
|
}
|
||||||
|
return d
|
||||||
|
}
|
||||||
|
|
||||||
|
// encodeBlockBetterSnappyGo encodes a non-empty src to a guaranteed-large-enough dst. It
|
||||||
|
// assumes that the varint-encoded length of the decompressed bytes has already
|
||||||
|
// been written.
|
||||||
|
//
|
||||||
|
// It also assumes that:
|
||||||
|
// len(dst) >= MaxEncodedLen(len(src)) &&
|
||||||
|
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
|
||||||
|
func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {
|
||||||
|
// sLimit is when to stop looking for offset/length copies. The inputMargin
|
||||||
|
// lets us use a fast path for emitLiteral in the main loop, while we are
|
||||||
|
// looking for copies.
|
||||||
|
sLimit := len(src) - inputMargin
|
||||||
|
if len(src) < minNonLiteralBlockSize {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialize the hash tables.
|
||||||
|
const (
|
||||||
|
// Long hash matches.
|
||||||
|
lTableBits = 16
|
||||||
|
maxLTableSize = 1 << lTableBits
|
||||||
|
|
||||||
|
// Short hash matches.
|
||||||
|
sTableBits = 14
|
||||||
|
maxSTableSize = 1 << sTableBits
|
||||||
|
)
|
||||||
|
|
||||||
|
var lTable [maxLTableSize]uint32
|
||||||
|
var sTable [maxSTableSize]uint32
|
||||||
|
|
||||||
|
// Bail if we can't compress to at least this.
|
||||||
|
dstLimit := len(src) - len(src)>>5 - 6
|
||||||
|
|
||||||
|
// nextEmit is where in src the next emitLiteral should start from.
|
||||||
|
nextEmit := 0
|
||||||
|
|
||||||
|
// The encoded form must start with a literal, as there are no previous
|
||||||
|
// bytes to copy, so we start looking for hash matches at s == 1.
|
||||||
|
s := 1
|
||||||
|
cv := load64(src, s)
|
||||||
|
|
||||||
|
// We initialize repeat to 0, so we never match on first attempt
|
||||||
|
repeat := 0
|
||||||
|
const maxSkip = 100
|
||||||
|
|
||||||
|
for {
|
||||||
|
candidateL := 0
|
||||||
|
nextS := 0
|
||||||
|
for {
|
||||||
|
// Next src position to check
|
||||||
|
nextS = (s-nextEmit)>>7 + 1
|
||||||
|
if nextS > maxSkip {
|
||||||
|
nextS = s + maxSkip
|
||||||
|
} else {
|
||||||
|
nextS += s
|
||||||
|
}
|
||||||
|
|
||||||
|
if nextS > sLimit {
|
||||||
|
goto emitRemainder
|
||||||
|
}
|
||||||
|
hashL := hash7(cv, lTableBits)
|
||||||
|
hashS := hash4(cv, sTableBits)
|
||||||
|
candidateL = int(lTable[hashL])
|
||||||
|
candidateS := int(sTable[hashS])
|
||||||
|
lTable[hashL] = uint32(s)
|
||||||
|
sTable[hashS] = uint32(s)
|
||||||
|
|
||||||
|
if uint32(cv) == load32(src, candidateL) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check our short candidate
|
||||||
|
if uint32(cv) == load32(src, candidateS) {
|
||||||
|
// Try a long candidate at s+1
|
||||||
|
hashL = hash7(cv>>8, lTableBits)
|
||||||
|
candidateL = int(lTable[hashL])
|
||||||
|
lTable[hashL] = uint32(s + 1)
|
||||||
|
if uint32(cv>>8) == load32(src, candidateL) {
|
||||||
|
s++
|
||||||
|
break
|
||||||
|
}
|
||||||
|
// Use our short candidate.
|
||||||
|
candidateL = candidateS
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
cv = load64(src, nextS)
|
||||||
|
s = nextS
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extend backwards
|
||||||
|
for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
|
||||||
|
candidateL--
|
||||||
|
s--
|
||||||
|
}
|
||||||
|
|
||||||
|
// Bail if we exceed the maximum size.
|
||||||
|
if d+(s-nextEmit) > dstLimit {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
base := s
|
||||||
|
offset := base - candidateL
|
||||||
|
|
||||||
|
// Extend the 4-byte match as long as possible.
|
||||||
|
s += 4
|
||||||
|
candidateL += 4
|
||||||
|
for s < len(src) {
|
||||||
|
if len(src)-s < 8 {
|
||||||
|
if src[s] == src[candidateL] {
|
||||||
|
s++
|
||||||
|
candidateL++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
|
||||||
|
s += bits.TrailingZeros64(diff) >> 3
|
||||||
|
break
|
||||||
|
}
|
||||||
|
s += 8
|
||||||
|
candidateL += 8
|
||||||
|
}
|
||||||
|
|
||||||
|
if offset > 65535 && s-base <= 5 && repeat != offset {
|
||||||
|
// Bail if the match is equal or worse to the encoding.
|
||||||
|
s = nextS + 1
|
||||||
|
if s >= sLimit {
|
||||||
|
goto emitRemainder
|
||||||
|
}
|
||||||
|
cv = load64(src, s)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
d += emitLiteral(dst[d:], src[nextEmit:base])
|
||||||
|
d += emitCopyNoRepeat(dst[d:], offset, s-base)
|
||||||
|
repeat = offset
|
||||||
|
|
||||||
|
nextEmit = s
|
||||||
|
if s >= sLimit {
|
||||||
|
goto emitRemainder
|
||||||
|
}
|
||||||
|
|
||||||
|
if d > dstLimit {
|
||||||
|
// Do we have space for more, if not bail.
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
// Index match start+1 (long) and start+2 (short)
|
||||||
|
index0 := base + 1
|
||||||
|
// Index match end-2 (long) and end-1 (short)
|
||||||
|
index1 := s - 2
|
||||||
|
|
||||||
|
cv0 := load64(src, index0)
|
||||||
|
cv1 := load64(src, index1)
|
||||||
|
cv = load64(src, s)
|
||||||
|
lTable[hash7(cv0, lTableBits)] = uint32(index0)
|
||||||
|
lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1)
|
||||||
|
lTable[hash7(cv1, lTableBits)] = uint32(index1)
|
||||||
|
lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)
|
||||||
|
sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
|
||||||
|
sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2)
|
||||||
|
sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
|
||||||
|
}
|
||||||
|
|
||||||
|
emitRemainder:
|
||||||
|
if nextEmit < len(src) {
|
||||||
|
// Bail if we exceed the maximum size.
|
||||||
|
if d+len(src)-nextEmit > dstLimit {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
d += emitLiteral(dst[d:], src[nextEmit:])
|
||||||
|
}
|
||||||
|
return d
|
||||||
|
}
|
307
vendor/github.com/klauspost/compress/s2/encode_go.go
generated
vendored
Normal file
307
vendor/github.com/klauspost/compress/s2/encode_go.go
generated
vendored
Normal file
|
@ -0,0 +1,307 @@
|
||||||
|
//go:build !amd64 || appengine || !gc || noasm
|
||||||
|
// +build !amd64 appengine !gc noasm
|
||||||
|
|
||||||
|
package s2
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math/bits"
|
||||||
|
)
|
||||||
|
|
||||||
|
// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
|
||||||
|
// assumes that the varint-encoded length of the decompressed bytes has already
|
||||||
|
// been written.
|
||||||
|
//
|
||||||
|
// It also assumes that:
|
||||||
|
// len(dst) >= MaxEncodedLen(len(src))
|
||||||
|
func encodeBlock(dst, src []byte) (d int) {
|
||||||
|
if len(src) < minNonLiteralBlockSize {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return encodeBlockGo(dst, src)
|
||||||
|
}
|
||||||
|
|
||||||
|
// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It
|
||||||
|
// assumes that the varint-encoded length of the decompressed bytes has already
|
||||||
|
// been written.
|
||||||
|
//
|
||||||
|
// It also assumes that:
|
||||||
|
// len(dst) >= MaxEncodedLen(len(src))
|
||||||
|
func encodeBlockBetter(dst, src []byte) (d int) {
|
||||||
|
return encodeBlockBetterGo(dst, src)
|
||||||
|
}
|
||||||
|
|
||||||
|
// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It
|
||||||
|
// assumes that the varint-encoded length of the decompressed bytes has already
|
||||||
|
// been written.
|
||||||
|
//
|
||||||
|
// It also assumes that:
|
||||||
|
// len(dst) >= MaxEncodedLen(len(src))
|
||||||
|
func encodeBlockBetterSnappy(dst, src []byte) (d int) {
|
||||||
|
return encodeBlockBetterSnappyGo(dst, src)
|
||||||
|
}
|
||||||
|
|
||||||
|
// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
|
||||||
|
// assumes that the varint-encoded length of the decompressed bytes has already
|
||||||
|
// been written.
|
||||||
|
//
|
||||||
|
// It also assumes that:
|
||||||
|
// len(dst) >= MaxEncodedLen(len(src))
|
||||||
|
func encodeBlockSnappy(dst, src []byte) (d int) {
|
||||||
|
if len(src) < minNonLiteralBlockSize {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return encodeBlockSnappyGo(dst, src)
|
||||||
|
}
|
||||||
|
|
||||||
|
// emitLiteral writes a literal chunk and returns the number of bytes written.
|
||||||
|
//
|
||||||
|
// It assumes that:
|
||||||
|
// dst is long enough to hold the encoded bytes
|
||||||
|
// 0 <= len(lit) && len(lit) <= math.MaxUint32
|
||||||
|
func emitLiteral(dst, lit []byte) int {
|
||||||
|
if len(lit) == 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
const num = 63<<2 | tagLiteral
|
||||||
|
i, n := 0, uint(len(lit)-1)
|
||||||
|
switch {
|
||||||
|
case n < 60:
|
||||||
|
dst[0] = uint8(n)<<2 | tagLiteral
|
||||||
|
i = 1
|
||||||
|
case n < 1<<8:
|
||||||
|
dst[1] = uint8(n)
|
||||||
|
dst[0] = 60<<2 | tagLiteral
|
||||||
|
i = 2
|
||||||
|
case n < 1<<16:
|
||||||
|
dst[2] = uint8(n >> 8)
|
||||||
|
dst[1] = uint8(n)
|
||||||
|
dst[0] = 61<<2 | tagLiteral
|
||||||
|
i = 3
|
||||||
|
case n < 1<<24:
|
||||||
|
dst[3] = uint8(n >> 16)
|
||||||
|
dst[2] = uint8(n >> 8)
|
||||||
|
dst[1] = uint8(n)
|
||||||
|
dst[0] = 62<<2 | tagLiteral
|
||||||
|
i = 4
|
||||||
|
default:
|
||||||
|
dst[4] = uint8(n >> 24)
|
||||||
|
dst[3] = uint8(n >> 16)
|
||||||
|
dst[2] = uint8(n >> 8)
|
||||||
|
dst[1] = uint8(n)
|
||||||
|
dst[0] = 63<<2 | tagLiteral
|
||||||
|
i = 5
|
||||||
|
}
|
||||||
|
return i + copy(dst[i:], lit)
|
||||||
|
}
|
||||||
|
|
||||||
|
// emitRepeat writes a repeat chunk and returns the number of bytes written.
|
||||||
|
// Length must be at least 4 and < 1<<24
|
||||||
|
func emitRepeat(dst []byte, offset, length int) int {
|
||||||
|
// Repeat offset, make length cheaper
|
||||||
|
length -= 4
|
||||||
|
if length <= 4 {
|
||||||
|
dst[0] = uint8(length)<<2 | tagCopy1
|
||||||
|
dst[1] = 0
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if length < 8 && offset < 2048 {
|
||||||
|
// Encode WITH offset
|
||||||
|
dst[1] = uint8(offset)
|
||||||
|
dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if length < (1<<8)+4 {
|
||||||
|
length -= 4
|
||||||
|
dst[2] = uint8(length)
|
||||||
|
dst[1] = 0
|
||||||
|
dst[0] = 5<<2 | tagCopy1
|
||||||
|
return 3
|
||||||
|
}
|
||||||
|
if length < (1<<16)+(1<<8) {
|
||||||
|
length -= 1 << 8
|
||||||
|
dst[3] = uint8(length >> 8)
|
||||||
|
dst[2] = uint8(length >> 0)
|
||||||
|
dst[1] = 0
|
||||||
|
dst[0] = 6<<2 | tagCopy1
|
||||||
|
return 4
|
||||||
|
}
|
||||||
|
const maxRepeat = (1 << 24) - 1
|
||||||
|
length -= 1 << 16
|
||||||
|
left := 0
|
||||||
|
if length > maxRepeat {
|
||||||
|
left = length - maxRepeat + 4
|
||||||
|
length = maxRepeat - 4
|
||||||
|
}
|
||||||
|
dst[4] = uint8(length >> 16)
|
||||||
|
dst[3] = uint8(length >> 8)
|
||||||
|
dst[2] = uint8(length >> 0)
|
||||||
|
dst[1] = 0
|
||||||
|
dst[0] = 7<<2 | tagCopy1
|
||||||
|
if left > 0 {
|
||||||
|
return 5 + emitRepeat(dst[5:], offset, left)
|
||||||
|
}
|
||||||
|
return 5
|
||||||
|
}
|
||||||
|
|
||||||
|
// emitCopy writes a copy chunk and returns the number of bytes written.
|
||||||
|
//
|
||||||
|
// It assumes that:
|
||||||
|
// dst is long enough to hold the encoded bytes
|
||||||
|
// 1 <= offset && offset <= math.MaxUint32
|
||||||
|
// 4 <= length && length <= 1 << 24
|
||||||
|
func emitCopy(dst []byte, offset, length int) int {
|
||||||
|
if offset >= 65536 {
|
||||||
|
i := 0
|
||||||
|
if length > 64 {
|
||||||
|
// Emit a length 64 copy, encoded as 5 bytes.
|
||||||
|
dst[4] = uint8(offset >> 24)
|
||||||
|
dst[3] = uint8(offset >> 16)
|
||||||
|
dst[2] = uint8(offset >> 8)
|
||||||
|
dst[1] = uint8(offset)
|
||||||
|
dst[0] = 63<<2 | tagCopy4
|
||||||
|
length -= 64
|
||||||
|
if length >= 4 {
|
||||||
|
// Emit remaining as repeats
|
||||||
|
return 5 + emitRepeat(dst[5:], offset, length)
|
||||||
|
}
|
||||||
|
i = 5
|
||||||
|
}
|
||||||
|
if length == 0 {
|
||||||
|
return i
|
||||||
|
}
|
||||||
|
// Emit a copy, offset encoded as 4 bytes.
|
||||||
|
dst[i+0] = uint8(length-1)<<2 | tagCopy4
|
||||||
|
dst[i+1] = uint8(offset)
|
||||||
|
dst[i+2] = uint8(offset >> 8)
|
||||||
|
dst[i+3] = uint8(offset >> 16)
|
||||||
|
dst[i+4] = uint8(offset >> 24)
|
||||||
|
return i + 5
|
||||||
|
}
|
||||||
|
|
||||||
|
// Offset no more than 2 bytes.
|
||||||
|
if length > 64 {
|
||||||
|
off := 3
|
||||||
|
if offset < 2048 {
|
||||||
|
// emit 8 bytes as tagCopy1, rest as repeats.
|
||||||
|
dst[1] = uint8(offset)
|
||||||
|
dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1
|
||||||
|
length -= 8
|
||||||
|
off = 2
|
||||||
|
} else {
|
||||||
|
// Emit a length 60 copy, encoded as 3 bytes.
|
||||||
|
// Emit remaining as repeat value (minimum 4 bytes).
|
||||||
|
dst[2] = uint8(offset >> 8)
|
||||||
|
dst[1] = uint8(offset)
|
||||||
|
dst[0] = 59<<2 | tagCopy2
|
||||||
|
length -= 60
|
||||||
|
}
|
||||||
|
// Emit remaining as repeats, at least 4 bytes remain.
|
||||||
|
return off + emitRepeat(dst[off:], offset, length)
|
||||||
|
}
|
||||||
|
if length >= 12 || offset >= 2048 {
|
||||||
|
// Emit the remaining copy, encoded as 3 bytes.
|
||||||
|
dst[2] = uint8(offset >> 8)
|
||||||
|
dst[1] = uint8(offset)
|
||||||
|
dst[0] = uint8(length-1)<<2 | tagCopy2
|
||||||
|
return 3
|
||||||
|
}
|
||||||
|
// Emit the remaining copy, encoded as 2 bytes.
|
||||||
|
dst[1] = uint8(offset)
|
||||||
|
dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.
|
||||||
|
//
|
||||||
|
// It assumes that:
|
||||||
|
// dst is long enough to hold the encoded bytes
|
||||||
|
// 1 <= offset && offset <= math.MaxUint32
|
||||||
|
// 4 <= length && length <= 1 << 24
|
||||||
|
func emitCopyNoRepeat(dst []byte, offset, length int) int {
|
||||||
|
if offset >= 65536 {
|
||||||
|
i := 0
|
||||||
|
if length > 64 {
|
||||||
|
// Emit a length 64 copy, encoded as 5 bytes.
|
||||||
|
dst[4] = uint8(offset >> 24)
|
||||||
|
dst[3] = uint8(offset >> 16)
|
||||||
|
dst[2] = uint8(offset >> 8)
|
||||||
|
dst[1] = uint8(offset)
|
||||||
|
dst[0] = 63<<2 | tagCopy4
|
||||||
|
length -= 64
|
||||||
|
if length >= 4 {
|
||||||
|
// Emit remaining as repeats
|
||||||
|
return 5 + emitCopyNoRepeat(dst[5:], offset, length)
|
||||||
|
}
|
||||||
|
i = 5
|
||||||
|
}
|
||||||
|
if length == 0 {
|
||||||
|
return i
|
||||||
|
}
|
||||||
|
// Emit a copy, offset encoded as 4 bytes.
|
||||||
|
dst[i+0] = uint8(length-1)<<2 | tagCopy4
|
||||||
|
dst[i+1] = uint8(offset)
|
||||||
|
dst[i+2] = uint8(offset >> 8)
|
||||||
|
dst[i+3] = uint8(offset >> 16)
|
||||||
|
dst[i+4] = uint8(offset >> 24)
|
||||||
|
return i + 5
|
||||||
|
}
|
||||||
|
|
||||||
|
// Offset no more than 2 bytes.
|
||||||
|
if length > 64 {
|
||||||
|
// Emit a length 60 copy, encoded as 3 bytes.
|
||||||
|
// Emit remaining as repeat value (minimum 4 bytes).
|
||||||
|
dst[2] = uint8(offset >> 8)
|
||||||
|
dst[1] = uint8(offset)
|
||||||
|
dst[0] = 59<<2 | tagCopy2
|
||||||
|
length -= 60
|
||||||
|
// Emit remaining as repeats, at least 4 bytes remain.
|
||||||
|
return 3 + emitCopyNoRepeat(dst[3:], offset, length)
|
||||||
|
}
|
||||||
|
if length >= 12 || offset >= 2048 {
|
||||||
|
// Emit the remaining copy, encoded as 3 bytes.
|
||||||
|
dst[2] = uint8(offset >> 8)
|
||||||
|
dst[1] = uint8(offset)
|
||||||
|
dst[0] = uint8(length-1)<<2 | tagCopy2
|
||||||
|
return 3
|
||||||
|
}
|
||||||
|
// Emit the remaining copy, encoded as 2 bytes.
|
||||||
|
dst[1] = uint8(offset)
|
||||||
|
dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
// matchLen returns how many bytes match in a and b
|
||||||
|
//
|
||||||
|
// It assumes that:
|
||||||
|
// len(a) <= len(b)
|
||||||
|
//
|
||||||
|
func matchLen(a []byte, b []byte) int {
|
||||||
|
b = b[:len(a)]
|
||||||
|
var checked int
|
||||||
|
if len(a) > 4 {
|
||||||
|
// Try 4 bytes first
|
||||||
|
if diff := load32(a, 0) ^ load32(b, 0); diff != 0 {
|
||||||
|
return bits.TrailingZeros32(diff) >> 3
|
||||||
|
}
|
||||||
|
// Switch to 8 byte matching.
|
||||||
|
checked = 4
|
||||||
|
a = a[4:]
|
||||||
|
b = b[4:]
|
||||||
|
for len(a) >= 8 {
|
||||||
|
b = b[:len(a)]
|
||||||
|
if diff := load64(a, 0) ^ load64(b, 0); diff != 0 {
|
||||||
|
return checked + (bits.TrailingZeros64(diff) >> 3)
|
||||||
|
}
|
||||||
|
checked += 8
|
||||||
|
a = a[8:]
|
||||||
|
b = b[8:]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b = b[:len(a)]
|
||||||
|
for i := range a {
|
||||||
|
if a[i] != b[i] {
|
||||||
|
return int(i) + checked
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return len(a) + checked
|
||||||
|
}
|
191
vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go
generated
vendored
Normal file
191
vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go
generated
vendored
Normal file
|
@ -0,0 +1,191 @@
|
||||||
|
// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
|
||||||
|
|
||||||
|
//go:build !appengine && !noasm && gc && !noasm
|
||||||
|
// +build !appengine,!noasm,gc,!noasm
|
||||||
|
|
||||||
|
package s2
|
||||||
|
|
||||||
|
func _dummy_()
|
||||||
|
|
||||||
|
// encodeBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.
|
||||||
|
// Maximum input 4294967295 bytes.
|
||||||
|
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func encodeBlockAsm(dst []byte, src []byte) int
|
||||||
|
|
||||||
|
// encodeBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst.
|
||||||
|
// Maximum input 4194304 bytes.
|
||||||
|
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func encodeBlockAsm4MB(dst []byte, src []byte) int
|
||||||
|
|
||||||
|
// encodeBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst.
|
||||||
|
// Maximum input 16383 bytes.
|
||||||
|
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func encodeBlockAsm12B(dst []byte, src []byte) int
|
||||||
|
|
||||||
|
// encodeBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst.
|
||||||
|
// Maximum input 4095 bytes.
|
||||||
|
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func encodeBlockAsm10B(dst []byte, src []byte) int
|
||||||
|
|
||||||
|
// encodeBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst.
|
||||||
|
// Maximum input 511 bytes.
|
||||||
|
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func encodeBlockAsm8B(dst []byte, src []byte) int
|
||||||
|
|
||||||
|
// encodeBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.
|
||||||
|
// Maximum input 4294967295 bytes.
|
||||||
|
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func encodeBetterBlockAsm(dst []byte, src []byte) int
|
||||||
|
|
||||||
|
// encodeBetterBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst.
|
||||||
|
// Maximum input 4194304 bytes.
|
||||||
|
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func encodeBetterBlockAsm4MB(dst []byte, src []byte) int
|
||||||
|
|
||||||
|
// encodeBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst.
|
||||||
|
// Maximum input 16383 bytes.
|
||||||
|
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func encodeBetterBlockAsm12B(dst []byte, src []byte) int
|
||||||
|
|
||||||
|
// encodeBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst.
|
||||||
|
// Maximum input 4095 bytes.
|
||||||
|
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func encodeBetterBlockAsm10B(dst []byte, src []byte) int
|
||||||
|
|
||||||
|
// encodeBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst.
|
||||||
|
// Maximum input 511 bytes.
|
||||||
|
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func encodeBetterBlockAsm8B(dst []byte, src []byte) int
|
||||||
|
|
||||||
|
// encodeSnappyBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.
|
||||||
|
// Maximum input 4294967295 bytes.
|
||||||
|
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func encodeSnappyBlockAsm(dst []byte, src []byte) int
|
||||||
|
|
||||||
|
// encodeSnappyBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst.
|
||||||
|
// Maximum input 65535 bytes.
|
||||||
|
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func encodeSnappyBlockAsm64K(dst []byte, src []byte) int
|
||||||
|
|
||||||
|
// encodeSnappyBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst.
|
||||||
|
// Maximum input 16383 bytes.
|
||||||
|
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func encodeSnappyBlockAsm12B(dst []byte, src []byte) int
|
||||||
|
|
||||||
|
// encodeSnappyBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst.
|
||||||
|
// Maximum input 4095 bytes.
|
||||||
|
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func encodeSnappyBlockAsm10B(dst []byte, src []byte) int
|
||||||
|
|
||||||
|
// encodeSnappyBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst.
|
||||||
|
// Maximum input 511 bytes.
|
||||||
|
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func encodeSnappyBlockAsm8B(dst []byte, src []byte) int
|
||||||
|
|
||||||
|
// encodeSnappyBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.
|
||||||
|
// Maximum input 4294967295 bytes.
|
||||||
|
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
|
||||||
|
|
||||||
|
// encodeSnappyBetterBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst.
|
||||||
|
// Maximum input 65535 bytes.
|
||||||
|
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int
|
||||||
|
|
||||||
|
// encodeSnappyBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst.
|
||||||
|
// Maximum input 16383 bytes.
|
||||||
|
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int
|
||||||
|
|
||||||
|
// encodeSnappyBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst.
|
||||||
|
// Maximum input 4095 bytes.
|
||||||
|
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int
|
||||||
|
|
||||||
|
// encodeSnappyBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst.
|
||||||
|
// Maximum input 511 bytes.
|
||||||
|
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
|
||||||
|
|
||||||
|
// emitLiteral writes a literal chunk and returns the number of bytes written.
|
||||||
|
//
|
||||||
|
// It assumes that:
|
||||||
|
// dst is long enough to hold the encoded bytes with margin of 0 bytes
|
||||||
|
// 0 <= len(lit) && len(lit) <= math.MaxUint32
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func emitLiteral(dst []byte, lit []byte) int
|
||||||
|
|
||||||
|
// emitRepeat writes a repeat chunk and returns the number of bytes written.
|
||||||
|
// Length must be at least 4 and < 1<<32
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func emitRepeat(dst []byte, offset int, length int) int
|
||||||
|
|
||||||
|
// emitCopy writes a copy chunk and returns the number of bytes written.
|
||||||
|
//
|
||||||
|
// It assumes that:
|
||||||
|
// dst is long enough to hold the encoded bytes
|
||||||
|
// 1 <= offset && offset <= math.MaxUint32
|
||||||
|
// 4 <= length && length <= 1 << 24
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func emitCopy(dst []byte, offset int, length int) int
|
||||||
|
|
||||||
|
// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.
|
||||||
|
//
|
||||||
|
// It assumes that:
|
||||||
|
// dst is long enough to hold the encoded bytes
|
||||||
|
// 1 <= offset && offset <= math.MaxUint32
|
||||||
|
// 4 <= length && length <= 1 << 24
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func emitCopyNoRepeat(dst []byte, offset int, length int) int
|
||||||
|
|
||||||
|
// matchLen returns how many bytes match in a and b
|
||||||
|
//
|
||||||
|
// It assumes that:
|
||||||
|
// len(a) <= len(b)
|
||||||
|
//
|
||||||
|
//go:noescape
|
||||||
|
func matchLen(a []byte, b []byte) int
|
17779
vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
generated
vendored
Normal file
17779
vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
generated
vendored
Normal file
File diff suppressed because it is too large
Load diff
598
vendor/github.com/klauspost/compress/s2/index.go
generated
vendored
Normal file
598
vendor/github.com/klauspost/compress/s2/index.go
generated
vendored
Normal file
|
@ -0,0 +1,598 @@
|
||||||
|
// Copyright (c) 2022+ Klaus Post. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package s2
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"encoding/binary"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"sort"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
S2IndexHeader = "s2idx\x00"
|
||||||
|
S2IndexTrailer = "\x00xdi2s"
|
||||||
|
maxIndexEntries = 1 << 16
|
||||||
|
)
|
||||||
|
|
||||||
|
// Index represents an S2/Snappy index.
|
||||||
|
type Index struct {
|
||||||
|
TotalUncompressed int64 // Total Uncompressed size if known. Will be -1 if unknown.
|
||||||
|
TotalCompressed int64 // Total Compressed size if known. Will be -1 if unknown.
|
||||||
|
info []struct {
|
||||||
|
compressedOffset int64
|
||||||
|
uncompressedOffset int64
|
||||||
|
}
|
||||||
|
estBlockUncomp int64
|
||||||
|
}
|
||||||
|
|
||||||
|
func (i *Index) reset(maxBlock int) {
|
||||||
|
i.estBlockUncomp = int64(maxBlock)
|
||||||
|
i.TotalCompressed = -1
|
||||||
|
i.TotalUncompressed = -1
|
||||||
|
if len(i.info) > 0 {
|
||||||
|
i.info = i.info[:0]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// allocInfos will allocate an empty slice of infos.
|
||||||
|
func (i *Index) allocInfos(n int) {
|
||||||
|
if n > maxIndexEntries {
|
||||||
|
panic("n > maxIndexEntries")
|
||||||
|
}
|
||||||
|
i.info = make([]struct {
|
||||||
|
compressedOffset int64
|
||||||
|
uncompressedOffset int64
|
||||||
|
}, 0, n)
|
||||||
|
}
|
||||||
|
|
||||||
|
// add an uncompressed and compressed pair.
|
||||||
|
// Entries must be sent in order.
|
||||||
|
func (i *Index) add(compressedOffset, uncompressedOffset int64) error {
|
||||||
|
if i == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
lastIdx := len(i.info) - 1
|
||||||
|
if lastIdx >= 0 {
|
||||||
|
latest := i.info[lastIdx]
|
||||||
|
if latest.uncompressedOffset == uncompressedOffset {
|
||||||
|
// Uncompressed didn't change, don't add entry,
|
||||||
|
// but update start index.
|
||||||
|
latest.compressedOffset = compressedOffset
|
||||||
|
i.info[lastIdx] = latest
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if latest.uncompressedOffset > uncompressedOffset {
|
||||||
|
return fmt.Errorf("internal error: Earlier uncompressed received (%d > %d)", latest.uncompressedOffset, uncompressedOffset)
|
||||||
|
}
|
||||||
|
if latest.compressedOffset > compressedOffset {
|
||||||
|
return fmt.Errorf("internal error: Earlier compressed received (%d > %d)", latest.uncompressedOffset, uncompressedOffset)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
i.info = append(i.info, struct {
|
||||||
|
compressedOffset int64
|
||||||
|
uncompressedOffset int64
|
||||||
|
}{compressedOffset: compressedOffset, uncompressedOffset: uncompressedOffset})
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find the offset at or before the wanted (uncompressed) offset.
|
||||||
|
// If offset is 0 or positive it is the offset from the beginning of the file.
|
||||||
|
// If the uncompressed size is known, the offset must be within the file.
|
||||||
|
// If an offset outside the file is requested io.ErrUnexpectedEOF is returned.
|
||||||
|
// If the offset is negative, it is interpreted as the distance from the end of the file,
|
||||||
|
// where -1 represents the last byte.
|
||||||
|
// If offset from the end of the file is requested, but size is unknown,
|
||||||
|
// ErrUnsupported will be returned.
|
||||||
|
func (i *Index) Find(offset int64) (compressedOff, uncompressedOff int64, err error) {
|
||||||
|
if i.TotalUncompressed < 0 {
|
||||||
|
return 0, 0, ErrCorrupt
|
||||||
|
}
|
||||||
|
if offset < 0 {
|
||||||
|
offset = i.TotalUncompressed + offset
|
||||||
|
if offset < 0 {
|
||||||
|
return 0, 0, io.ErrUnexpectedEOF
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if offset > i.TotalUncompressed {
|
||||||
|
return 0, 0, io.ErrUnexpectedEOF
|
||||||
|
}
|
||||||
|
if len(i.info) > 200 {
|
||||||
|
n := sort.Search(len(i.info), func(n int) bool {
|
||||||
|
return i.info[n].uncompressedOffset > offset
|
||||||
|
})
|
||||||
|
if n == 0 {
|
||||||
|
n = 1
|
||||||
|
}
|
||||||
|
return i.info[n-1].compressedOffset, i.info[n-1].uncompressedOffset, nil
|
||||||
|
}
|
||||||
|
for _, info := range i.info {
|
||||||
|
if info.uncompressedOffset > offset {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
compressedOff = info.compressedOffset
|
||||||
|
uncompressedOff = info.uncompressedOffset
|
||||||
|
}
|
||||||
|
return compressedOff, uncompressedOff, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// reduce to stay below maxIndexEntries
|
||||||
|
func (i *Index) reduce() {
|
||||||
|
if len(i.info) < maxIndexEntries && i.estBlockUncomp >= 1<<20 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Algorithm, keep 1, remove removeN entries...
|
||||||
|
removeN := (len(i.info) + 1) / maxIndexEntries
|
||||||
|
src := i.info
|
||||||
|
j := 0
|
||||||
|
|
||||||
|
// Each block should be at least 1MB, but don't reduce below 1000 entries.
|
||||||
|
for i.estBlockUncomp*(int64(removeN)+1) < 1<<20 && len(i.info)/(removeN+1) > 1000 {
|
||||||
|
removeN++
|
||||||
|
}
|
||||||
|
for idx := 0; idx < len(src); idx++ {
|
||||||
|
i.info[j] = src[idx]
|
||||||
|
j++
|
||||||
|
idx += removeN
|
||||||
|
}
|
||||||
|
i.info = i.info[:j]
|
||||||
|
// Update maxblock estimate.
|
||||||
|
i.estBlockUncomp += i.estBlockUncomp * int64(removeN)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (i *Index) appendTo(b []byte, uncompTotal, compTotal int64) []byte {
|
||||||
|
i.reduce()
|
||||||
|
var tmp [binary.MaxVarintLen64]byte
|
||||||
|
|
||||||
|
initSize := len(b)
|
||||||
|
// We make the start a skippable header+size.
|
||||||
|
b = append(b, ChunkTypeIndex, 0, 0, 0)
|
||||||
|
b = append(b, []byte(S2IndexHeader)...)
|
||||||
|
// Total Uncompressed size
|
||||||
|
n := binary.PutVarint(tmp[:], uncompTotal)
|
||||||
|
b = append(b, tmp[:n]...)
|
||||||
|
// Total Compressed size
|
||||||
|
n = binary.PutVarint(tmp[:], compTotal)
|
||||||
|
b = append(b, tmp[:n]...)
|
||||||
|
// Put EstBlockUncomp size
|
||||||
|
n = binary.PutVarint(tmp[:], i.estBlockUncomp)
|
||||||
|
b = append(b, tmp[:n]...)
|
||||||
|
// Put length
|
||||||
|
n = binary.PutVarint(tmp[:], int64(len(i.info)))
|
||||||
|
b = append(b, tmp[:n]...)
|
||||||
|
|
||||||
|
// Check if we should add uncompressed offsets
|
||||||
|
var hasUncompressed byte
|
||||||
|
for idx, info := range i.info {
|
||||||
|
if idx == 0 {
|
||||||
|
if info.uncompressedOffset != 0 {
|
||||||
|
hasUncompressed = 1
|
||||||
|
break
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if info.uncompressedOffset != i.info[idx-1].uncompressedOffset+i.estBlockUncomp {
|
||||||
|
hasUncompressed = 1
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b = append(b, hasUncompressed)
|
||||||
|
|
||||||
|
// Add each entry
|
||||||
|
if hasUncompressed == 1 {
|
||||||
|
for idx, info := range i.info {
|
||||||
|
uOff := info.uncompressedOffset
|
||||||
|
if idx > 0 {
|
||||||
|
prev := i.info[idx-1]
|
||||||
|
uOff -= prev.uncompressedOffset + (i.estBlockUncomp)
|
||||||
|
}
|
||||||
|
n = binary.PutVarint(tmp[:], uOff)
|
||||||
|
b = append(b, tmp[:n]...)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initial compressed size estimate.
|
||||||
|
cPredict := i.estBlockUncomp / 2
|
||||||
|
|
||||||
|
for idx, info := range i.info {
|
||||||
|
cOff := info.compressedOffset
|
||||||
|
if idx > 0 {
|
||||||
|
prev := i.info[idx-1]
|
||||||
|
cOff -= prev.compressedOffset + cPredict
|
||||||
|
// Update compressed size prediction, with half the error.
|
||||||
|
cPredict += cOff / 2
|
||||||
|
}
|
||||||
|
n = binary.PutVarint(tmp[:], cOff)
|
||||||
|
b = append(b, tmp[:n]...)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add Total Size.
|
||||||
|
// Stored as fixed size for easier reading.
|
||||||
|
binary.LittleEndian.PutUint32(tmp[:], uint32(len(b)-initSize+4+len(S2IndexTrailer)))
|
||||||
|
b = append(b, tmp[:4]...)
|
||||||
|
// Trailer
|
||||||
|
b = append(b, []byte(S2IndexTrailer)...)
|
||||||
|
|
||||||
|
// Update size
|
||||||
|
chunkLen := len(b) - initSize - skippableFrameHeader
|
||||||
|
b[initSize+1] = uint8(chunkLen >> 0)
|
||||||
|
b[initSize+2] = uint8(chunkLen >> 8)
|
||||||
|
b[initSize+3] = uint8(chunkLen >> 16)
|
||||||
|
//fmt.Printf("chunklen: 0x%x Uncomp:%d, Comp:%d\n", chunkLen, uncompTotal, compTotal)
|
||||||
|
return b
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load a binary index.
|
||||||
|
// A zero value Index can be used or a previous one can be reused.
|
||||||
|
func (i *Index) Load(b []byte) ([]byte, error) {
|
||||||
|
if len(b) <= 4+len(S2IndexHeader)+len(S2IndexTrailer) {
|
||||||
|
return b, io.ErrUnexpectedEOF
|
||||||
|
}
|
||||||
|
if b[0] != ChunkTypeIndex {
|
||||||
|
return b, ErrCorrupt
|
||||||
|
}
|
||||||
|
chunkLen := int(b[1]) | int(b[2])<<8 | int(b[3])<<16
|
||||||
|
b = b[4:]
|
||||||
|
|
||||||
|
// Validate we have enough...
|
||||||
|
if len(b) < chunkLen {
|
||||||
|
return b, io.ErrUnexpectedEOF
|
||||||
|
}
|
||||||
|
if !bytes.Equal(b[:len(S2IndexHeader)], []byte(S2IndexHeader)) {
|
||||||
|
return b, ErrUnsupported
|
||||||
|
}
|
||||||
|
b = b[len(S2IndexHeader):]
|
||||||
|
|
||||||
|
// Total Uncompressed
|
||||||
|
if v, n := binary.Varint(b); n <= 0 || v < 0 {
|
||||||
|
return b, ErrCorrupt
|
||||||
|
} else {
|
||||||
|
i.TotalUncompressed = v
|
||||||
|
b = b[n:]
|
||||||
|
}
|
||||||
|
|
||||||
|
// Total Compressed
|
||||||
|
if v, n := binary.Varint(b); n <= 0 {
|
||||||
|
return b, ErrCorrupt
|
||||||
|
} else {
|
||||||
|
i.TotalCompressed = v
|
||||||
|
b = b[n:]
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read EstBlockUncomp
|
||||||
|
if v, n := binary.Varint(b); n <= 0 {
|
||||||
|
return b, ErrCorrupt
|
||||||
|
} else {
|
||||||
|
if v < 0 {
|
||||||
|
return b, ErrCorrupt
|
||||||
|
}
|
||||||
|
i.estBlockUncomp = v
|
||||||
|
b = b[n:]
|
||||||
|
}
|
||||||
|
|
||||||
|
var entries int
|
||||||
|
if v, n := binary.Varint(b); n <= 0 {
|
||||||
|
return b, ErrCorrupt
|
||||||
|
} else {
|
||||||
|
if v < 0 || v > maxIndexEntries {
|
||||||
|
return b, ErrCorrupt
|
||||||
|
}
|
||||||
|
entries = int(v)
|
||||||
|
b = b[n:]
|
||||||
|
}
|
||||||
|
if cap(i.info) < entries {
|
||||||
|
i.allocInfos(entries)
|
||||||
|
}
|
||||||
|
i.info = i.info[:entries]
|
||||||
|
|
||||||
|
if len(b) < 1 {
|
||||||
|
return b, io.ErrUnexpectedEOF
|
||||||
|
}
|
||||||
|
hasUncompressed := b[0]
|
||||||
|
b = b[1:]
|
||||||
|
if hasUncompressed&1 != hasUncompressed {
|
||||||
|
return b, ErrCorrupt
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add each uncompressed entry
|
||||||
|
for idx := range i.info {
|
||||||
|
var uOff int64
|
||||||
|
if hasUncompressed != 0 {
|
||||||
|
// Load delta
|
||||||
|
if v, n := binary.Varint(b); n <= 0 {
|
||||||
|
return b, ErrCorrupt
|
||||||
|
} else {
|
||||||
|
uOff = v
|
||||||
|
b = b[n:]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if idx > 0 {
|
||||||
|
prev := i.info[idx-1].uncompressedOffset
|
||||||
|
uOff += prev + (i.estBlockUncomp)
|
||||||
|
if uOff <= prev {
|
||||||
|
return b, ErrCorrupt
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if uOff < 0 {
|
||||||
|
return b, ErrCorrupt
|
||||||
|
}
|
||||||
|
i.info[idx].uncompressedOffset = uOff
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initial compressed size estimate.
|
||||||
|
cPredict := i.estBlockUncomp / 2
|
||||||
|
|
||||||
|
// Add each compressed entry
|
||||||
|
for idx := range i.info {
|
||||||
|
var cOff int64
|
||||||
|
if v, n := binary.Varint(b); n <= 0 {
|
||||||
|
return b, ErrCorrupt
|
||||||
|
} else {
|
||||||
|
cOff = v
|
||||||
|
b = b[n:]
|
||||||
|
}
|
||||||
|
|
||||||
|
if idx > 0 {
|
||||||
|
// Update compressed size prediction, with half the error.
|
||||||
|
cPredictNew := cPredict + cOff/2
|
||||||
|
|
||||||
|
prev := i.info[idx-1].compressedOffset
|
||||||
|
cOff += prev + cPredict
|
||||||
|
if cOff <= prev {
|
||||||
|
return b, ErrCorrupt
|
||||||
|
}
|
||||||
|
cPredict = cPredictNew
|
||||||
|
}
|
||||||
|
if cOff < 0 {
|
||||||
|
return b, ErrCorrupt
|
||||||
|
}
|
||||||
|
i.info[idx].compressedOffset = cOff
|
||||||
|
}
|
||||||
|
if len(b) < 4+len(S2IndexTrailer) {
|
||||||
|
return b, io.ErrUnexpectedEOF
|
||||||
|
}
|
||||||
|
// Skip size...
|
||||||
|
b = b[4:]
|
||||||
|
|
||||||
|
// Check trailer...
|
||||||
|
if !bytes.Equal(b[:len(S2IndexTrailer)], []byte(S2IndexTrailer)) {
|
||||||
|
return b, ErrCorrupt
|
||||||
|
}
|
||||||
|
return b[len(S2IndexTrailer):], nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadStream will load an index from the end of the supplied stream.
|
||||||
|
// ErrUnsupported will be returned if the signature cannot be found.
|
||||||
|
// ErrCorrupt will be returned if unexpected values are found.
|
||||||
|
// io.ErrUnexpectedEOF is returned if there are too few bytes.
|
||||||
|
// IO errors are returned as-is.
|
||||||
|
func (i *Index) LoadStream(rs io.ReadSeeker) error {
|
||||||
|
// Go to end.
|
||||||
|
_, err := rs.Seek(-10, io.SeekEnd)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
var tmp [10]byte
|
||||||
|
_, err = io.ReadFull(rs, tmp[:])
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
// Check trailer...
|
||||||
|
if !bytes.Equal(tmp[4:4+len(S2IndexTrailer)], []byte(S2IndexTrailer)) {
|
||||||
|
return ErrUnsupported
|
||||||
|
}
|
||||||
|
sz := binary.LittleEndian.Uint32(tmp[:4])
|
||||||
|
if sz > maxChunkSize+skippableFrameHeader {
|
||||||
|
return ErrCorrupt
|
||||||
|
}
|
||||||
|
_, err = rs.Seek(-int64(sz), io.SeekEnd)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read index.
|
||||||
|
buf := make([]byte, sz)
|
||||||
|
_, err = io.ReadFull(rs, buf)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
_, err = i.Load(buf)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// IndexStream will return an index for a stream.
|
||||||
|
// The stream structure will be checked, but
|
||||||
|
// data within blocks is not verified.
|
||||||
|
// The returned index can either be appended to the end of the stream
|
||||||
|
// or stored separately.
|
||||||
|
func IndexStream(r io.Reader) ([]byte, error) {
|
||||||
|
var i Index
|
||||||
|
var buf [maxChunkSize]byte
|
||||||
|
var readHeader bool
|
||||||
|
for {
|
||||||
|
_, err := io.ReadFull(r, buf[:4])
|
||||||
|
if err != nil {
|
||||||
|
if err == io.EOF {
|
||||||
|
return i.appendTo(nil, i.TotalUncompressed, i.TotalCompressed), nil
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
// Start of this chunk.
|
||||||
|
startChunk := i.TotalCompressed
|
||||||
|
i.TotalCompressed += 4
|
||||||
|
|
||||||
|
chunkType := buf[0]
|
||||||
|
if !readHeader {
|
||||||
|
if chunkType != chunkTypeStreamIdentifier {
|
||||||
|
return nil, ErrCorrupt
|
||||||
|
}
|
||||||
|
readHeader = true
|
||||||
|
}
|
||||||
|
chunkLen := int(buf[1]) | int(buf[2])<<8 | int(buf[3])<<16
|
||||||
|
if chunkLen < checksumSize {
|
||||||
|
return nil, ErrCorrupt
|
||||||
|
}
|
||||||
|
|
||||||
|
i.TotalCompressed += int64(chunkLen)
|
||||||
|
_, err = io.ReadFull(r, buf[:chunkLen])
|
||||||
|
if err != nil {
|
||||||
|
return nil, io.ErrUnexpectedEOF
|
||||||
|
}
|
||||||
|
// The chunk types are specified at
|
||||||
|
// https://github.com/google/snappy/blob/master/framing_format.txt
|
||||||
|
switch chunkType {
|
||||||
|
case chunkTypeCompressedData:
|
||||||
|
// Section 4.2. Compressed data (chunk type 0x00).
|
||||||
|
// Skip checksum.
|
||||||
|
dLen, err := DecodedLen(buf[checksumSize:])
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if dLen > maxBlockSize {
|
||||||
|
return nil, ErrCorrupt
|
||||||
|
}
|
||||||
|
if i.estBlockUncomp == 0 {
|
||||||
|
// Use first block for estimate...
|
||||||
|
i.estBlockUncomp = int64(dLen)
|
||||||
|
}
|
||||||
|
err = i.add(startChunk, i.TotalUncompressed)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
i.TotalUncompressed += int64(dLen)
|
||||||
|
continue
|
||||||
|
case chunkTypeUncompressedData:
|
||||||
|
n2 := chunkLen - checksumSize
|
||||||
|
if n2 > maxBlockSize {
|
||||||
|
return nil, ErrCorrupt
|
||||||
|
}
|
||||||
|
if i.estBlockUncomp == 0 {
|
||||||
|
// Use first block for estimate...
|
||||||
|
i.estBlockUncomp = int64(n2)
|
||||||
|
}
|
||||||
|
err = i.add(startChunk, i.TotalUncompressed)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
i.TotalUncompressed += int64(n2)
|
||||||
|
continue
|
||||||
|
case chunkTypeStreamIdentifier:
|
||||||
|
// Section 4.1. Stream identifier (chunk type 0xff).
|
||||||
|
if chunkLen != len(magicBody) {
|
||||||
|
return nil, ErrCorrupt
|
||||||
|
}
|
||||||
|
|
||||||
|
if string(buf[:len(magicBody)]) != magicBody {
|
||||||
|
if string(buf[:len(magicBody)]) != magicBodySnappy {
|
||||||
|
return nil, ErrCorrupt
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if chunkType <= 0x7f {
|
||||||
|
// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
|
||||||
|
return nil, ErrUnsupported
|
||||||
|
}
|
||||||
|
if chunkLen > maxChunkSize {
|
||||||
|
return nil, ErrUnsupported
|
||||||
|
}
|
||||||
|
// Section 4.4 Padding (chunk type 0xfe).
|
||||||
|
// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// JSON returns the index as JSON text.
|
||||||
|
func (i *Index) JSON() []byte {
|
||||||
|
x := struct {
|
||||||
|
TotalUncompressed int64 `json:"total_uncompressed"` // Total Uncompressed size if known. Will be -1 if unknown.
|
||||||
|
TotalCompressed int64 `json:"total_compressed"` // Total Compressed size if known. Will be -1 if unknown.
|
||||||
|
Offsets []struct {
|
||||||
|
CompressedOffset int64 `json:"compressed"`
|
||||||
|
UncompressedOffset int64 `json:"uncompressed"`
|
||||||
|
} `json:"offsets"`
|
||||||
|
EstBlockUncomp int64 `json:"est_block_uncompressed"`
|
||||||
|
}{
|
||||||
|
TotalUncompressed: i.TotalUncompressed,
|
||||||
|
TotalCompressed: i.TotalCompressed,
|
||||||
|
EstBlockUncomp: i.estBlockUncomp,
|
||||||
|
}
|
||||||
|
for _, v := range i.info {
|
||||||
|
x.Offsets = append(x.Offsets, struct {
|
||||||
|
CompressedOffset int64 `json:"compressed"`
|
||||||
|
UncompressedOffset int64 `json:"uncompressed"`
|
||||||
|
}{CompressedOffset: v.compressedOffset, UncompressedOffset: v.uncompressedOffset})
|
||||||
|
}
|
||||||
|
b, _ := json.MarshalIndent(x, "", " ")
|
||||||
|
return b
|
||||||
|
}
|
||||||
|
|
||||||
|
// RemoveIndexHeaders will trim all headers and trailers from a given index.
|
||||||
|
// This is expected to save 20 bytes.
|
||||||
|
// These can be restored using RestoreIndexHeaders.
|
||||||
|
// This removes a layer of security, but is the most compact representation.
|
||||||
|
// Returns nil if headers contains errors.
|
||||||
|
// The returned slice references the provided slice.
|
||||||
|
func RemoveIndexHeaders(b []byte) []byte {
|
||||||
|
const save = 4 + len(S2IndexHeader) + len(S2IndexTrailer) + 4
|
||||||
|
if len(b) <= save {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if b[0] != ChunkTypeIndex {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
chunkLen := int(b[1]) | int(b[2])<<8 | int(b[3])<<16
|
||||||
|
b = b[4:]
|
||||||
|
|
||||||
|
// Validate we have enough...
|
||||||
|
if len(b) < chunkLen {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
b = b[:chunkLen]
|
||||||
|
|
||||||
|
if !bytes.Equal(b[:len(S2IndexHeader)], []byte(S2IndexHeader)) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
b = b[len(S2IndexHeader):]
|
||||||
|
if !bytes.HasSuffix(b, []byte(S2IndexTrailer)) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
b = bytes.TrimSuffix(b, []byte(S2IndexTrailer))
|
||||||
|
|
||||||
|
if len(b) < 4 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return b[:len(b)-4]
|
||||||
|
}
|
||||||
|
|
||||||
|
// RestoreIndexHeaders will index restore headers removed by RemoveIndexHeaders.
|
||||||
|
// No error checking is performed on the input.
|
||||||
|
// If a 0 length slice is sent, it is returned without modification.
|
||||||
|
func RestoreIndexHeaders(in []byte) []byte {
|
||||||
|
if len(in) == 0 {
|
||||||
|
return in
|
||||||
|
}
|
||||||
|
b := make([]byte, 0, 4+len(S2IndexHeader)+len(in)+len(S2IndexTrailer)+4)
|
||||||
|
b = append(b, ChunkTypeIndex, 0, 0, 0)
|
||||||
|
b = append(b, []byte(S2IndexHeader)...)
|
||||||
|
b = append(b, in...)
|
||||||
|
|
||||||
|
var tmp [4]byte
|
||||||
|
binary.LittleEndian.PutUint32(tmp[:], uint32(len(b)+4+len(S2IndexTrailer)))
|
||||||
|
b = append(b, tmp[:4]...)
|
||||||
|
// Trailer
|
||||||
|
b = append(b, []byte(S2IndexTrailer)...)
|
||||||
|
|
||||||
|
chunkLen := len(b) - skippableFrameHeader
|
||||||
|
b[1] = uint8(chunkLen >> 0)
|
||||||
|
b[2] = uint8(chunkLen >> 8)
|
||||||
|
b[3] = uint8(chunkLen >> 16)
|
||||||
|
return b
|
||||||
|
}
|
143
vendor/github.com/klauspost/compress/s2/s2.go
generated
vendored
Normal file
143
vendor/github.com/klauspost/compress/s2/s2.go
generated
vendored
Normal file
|
@ -0,0 +1,143 @@
|
||||||
|
// Copyright 2011 The Snappy-Go Authors. All rights reserved.
|
||||||
|
// Copyright (c) 2019 Klaus Post. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
// Package s2 implements the S2 compression format.
|
||||||
|
//
|
||||||
|
// S2 is an extension of Snappy. Similar to Snappy S2 is aimed for high throughput,
|
||||||
|
// which is why it features concurrent compression for bigger payloads.
|
||||||
|
//
|
||||||
|
// Decoding is compatible with Snappy compressed content,
|
||||||
|
// but content compressed with S2 cannot be decompressed by Snappy.
|
||||||
|
//
|
||||||
|
// For more information on Snappy/S2 differences see README in: https://github.com/klauspost/compress/tree/master/s2
|
||||||
|
//
|
||||||
|
// There are actually two S2 formats: block and stream. They are related,
|
||||||
|
// but different: trying to decompress block-compressed data as a S2 stream
|
||||||
|
// will fail, and vice versa. The block format is the Decode and Encode
|
||||||
|
// functions and the stream format is the Reader and Writer types.
|
||||||
|
//
|
||||||
|
// A "better" compression option is available. This will trade some compression
|
||||||
|
// speed
|
||||||
|
//
|
||||||
|
// The block format, the more common case, is used when the complete size (the
|
||||||
|
// number of bytes) of the original data is known upfront, at the time
|
||||||
|
// compression starts. The stream format, also known as the framing format, is
|
||||||
|
// for when that isn't always true.
|
||||||
|
//
|
||||||
|
// Blocks to not offer much data protection, so it is up to you to
|
||||||
|
// add data validation of decompressed blocks.
|
||||||
|
//
|
||||||
|
// Streams perform CRC validation of the decompressed data.
|
||||||
|
// Stream compression will also be performed on multiple CPU cores concurrently
|
||||||
|
// significantly improving throughput.
|
||||||
|
package s2
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"hash/crc32"
|
||||||
|
)
|
||||||
|
|
||||||
|
/*
|
||||||
|
Each encoded block begins with the varint-encoded length of the decoded data,
|
||||||
|
followed by a sequence of chunks. Chunks begin and end on byte boundaries. The
|
||||||
|
first byte of each chunk is broken into its 2 least and 6 most significant bits
|
||||||
|
called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
|
||||||
|
Zero means a literal tag. All other values mean a copy tag.
|
||||||
|
|
||||||
|
For literal tags:
|
||||||
|
- If m < 60, the next 1 + m bytes are literal bytes.
|
||||||
|
- Otherwise, let n be the little-endian unsigned integer denoted by the next
|
||||||
|
m - 59 bytes. The next 1 + n bytes after that are literal bytes.
|
||||||
|
|
||||||
|
For copy tags, length bytes are copied from offset bytes ago, in the style of
|
||||||
|
Lempel-Ziv compression algorithms. In particular:
|
||||||
|
- For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
|
||||||
|
The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
|
||||||
|
of the offset. The next byte is bits 0-7 of the offset.
|
||||||
|
- For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
|
||||||
|
The length is 1 + m. The offset is the little-endian unsigned integer
|
||||||
|
denoted by the next 2 bytes.
|
||||||
|
- For l == 3, the offset ranges in [0, 1<<32) and the length in
|
||||||
|
[1, 65). The length is 1 + m. The offset is the little-endian unsigned
|
||||||
|
integer denoted by the next 4 bytes.
|
||||||
|
*/
|
||||||
|
const (
|
||||||
|
tagLiteral = 0x00
|
||||||
|
tagCopy1 = 0x01
|
||||||
|
tagCopy2 = 0x02
|
||||||
|
tagCopy4 = 0x03
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
checksumSize = 4
|
||||||
|
chunkHeaderSize = 4
|
||||||
|
magicChunk = "\xff\x06\x00\x00" + magicBody
|
||||||
|
magicChunkSnappy = "\xff\x06\x00\x00" + magicBodySnappy
|
||||||
|
magicBodySnappy = "sNaPpY"
|
||||||
|
magicBody = "S2sTwO"
|
||||||
|
|
||||||
|
// maxBlockSize is the maximum size of the input to encodeBlock.
|
||||||
|
//
|
||||||
|
// For the framing format (Writer type instead of Encode function),
|
||||||
|
// this is the maximum uncompressed size of a block.
|
||||||
|
maxBlockSize = 4 << 20
|
||||||
|
|
||||||
|
// minBlockSize is the minimum size of block setting when creating a writer.
|
||||||
|
minBlockSize = 4 << 10
|
||||||
|
|
||||||
|
skippableFrameHeader = 4
|
||||||
|
maxChunkSize = 1<<24 - 1 // 16777215
|
||||||
|
|
||||||
|
// Default block size
|
||||||
|
defaultBlockSize = 1 << 20
|
||||||
|
|
||||||
|
// maxSnappyBlockSize is the maximum snappy block size.
|
||||||
|
maxSnappyBlockSize = 1 << 16
|
||||||
|
|
||||||
|
obufHeaderLen = checksumSize + chunkHeaderSize
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
chunkTypeCompressedData = 0x00
|
||||||
|
chunkTypeUncompressedData = 0x01
|
||||||
|
ChunkTypeIndex = 0x99
|
||||||
|
chunkTypePadding = 0xfe
|
||||||
|
chunkTypeStreamIdentifier = 0xff
|
||||||
|
)
|
||||||
|
|
||||||
|
var crcTable = crc32.MakeTable(crc32.Castagnoli)
|
||||||
|
|
||||||
|
// crc implements the checksum specified in section 3 of
|
||||||
|
// https://github.com/google/snappy/blob/master/framing_format.txt
|
||||||
|
func crc(b []byte) uint32 {
|
||||||
|
c := crc32.Update(0, crcTable, b)
|
||||||
|
return c>>15 | c<<17 + 0xa282ead8
|
||||||
|
}
|
||||||
|
|
||||||
|
// literalExtraSize returns the extra size of encoding n literals.
|
||||||
|
// n should be >= 0 and <= math.MaxUint32.
|
||||||
|
func literalExtraSize(n int64) int64 {
|
||||||
|
if n == 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
switch {
|
||||||
|
case n < 60:
|
||||||
|
return 1
|
||||||
|
case n < 1<<8:
|
||||||
|
return 2
|
||||||
|
case n < 1<<16:
|
||||||
|
return 3
|
||||||
|
case n < 1<<24:
|
||||||
|
return 4
|
||||||
|
default:
|
||||||
|
return 5
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type byter interface {
|
||||||
|
Bytes() []byte
|
||||||
|
}
|
||||||
|
|
||||||
|
var _ byter = &bytes.Buffer{}
|
3
vendor/modules.txt
vendored
3
vendor/modules.txt
vendored
|
@ -16,7 +16,7 @@ cloud.google.com/go/storage
|
||||||
cloud.google.com/go/storage/internal
|
cloud.google.com/go/storage/internal
|
||||||
cloud.google.com/go/storage/internal/apiv2
|
cloud.google.com/go/storage/internal/apiv2
|
||||||
cloud.google.com/go/storage/internal/apiv2/stubs
|
cloud.google.com/go/storage/internal/apiv2/stubs
|
||||||
# github.com/VictoriaMetrics/fastcache v1.10.0
|
# github.com/VictoriaMetrics/fastcache v1.12.0
|
||||||
## explicit; go 1.13
|
## explicit; go 1.13
|
||||||
github.com/VictoriaMetrics/fastcache
|
github.com/VictoriaMetrics/fastcache
|
||||||
# github.com/VictoriaMetrics/fasthttp v1.1.0
|
# github.com/VictoriaMetrics/fasthttp v1.1.0
|
||||||
|
@ -166,6 +166,7 @@ github.com/klauspost/compress/gzip
|
||||||
github.com/klauspost/compress/huff0
|
github.com/klauspost/compress/huff0
|
||||||
github.com/klauspost/compress/internal/cpuinfo
|
github.com/klauspost/compress/internal/cpuinfo
|
||||||
github.com/klauspost/compress/internal/snapref
|
github.com/klauspost/compress/internal/snapref
|
||||||
|
github.com/klauspost/compress/s2
|
||||||
github.com/klauspost/compress/zlib
|
github.com/klauspost/compress/zlib
|
||||||
github.com/klauspost/compress/zstd
|
github.com/klauspost/compress/zstd
|
||||||
github.com/klauspost/compress/zstd/internal/xxhash
|
github.com/klauspost/compress/zstd/internal/xxhash
|
||||||
|
|
Loading…
Reference in a new issue