app/vmagent/remotewrite: add benchmarks for comparing the performance of standard Snappy encoder with github.com/klauspost/compress/s2 encoder

The standard Snappy encoder from github.com/golang/snappy shows quite good performance number
for compressing the Prometheus remote_write proto messages according to the added benchmarks,
so there is no need in switching to github.com/klauspost/compress/s2 yet.
This commit is contained in:
Aliaksandr Valialkin 2022-09-19 14:26:18 +03:00
parent b4410b1c63
commit 2b55d167d7
No known key found for this signature in database
GPG key ID: A72BEC6CD3D0DED1
23 changed files with 25601 additions and 4 deletions

View file

@ -0,0 +1,62 @@
package remotewrite
import (
"fmt"
"testing"
"github.com/golang/snappy"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
func TestPushWriteRequest(t *testing.T) {
for _, rowsCount := range []int{1, 10, 100, 1e3, 1e4} {
t.Run(fmt.Sprintf("%d", rowsCount), func(t *testing.T) {
testPushWriteRequest(t, rowsCount)
})
}
}
func testPushWriteRequest(t *testing.T, rowsCount int) {
wr := newTestWriteRequest(rowsCount, 10)
pushBlockLen := 0
pushBlock := func(block []byte) {
if pushBlockLen > 0 {
panic(fmt.Errorf("BUG: pushBlock called multiple times; pushBlockLen=%d at first call, len(block)=%d at second call", pushBlockLen, len(block)))
}
pushBlockLen = len(block)
}
pushWriteRequest(wr, pushBlock)
b := prompbmarshal.MarshalWriteRequest(nil, wr)
zb := snappy.Encode(nil, b)
maxPushBlockLen := len(zb)
minPushBlockLen := maxPushBlockLen / 2
if pushBlockLen < minPushBlockLen {
t.Fatalf("unexpected block len after pushWriteRequest; got %d bytes; must be at least %d bytes", pushBlockLen, minPushBlockLen)
}
if pushBlockLen > maxPushBlockLen {
t.Fatalf("unexpected block len after pushWriteRequest; got %d bytes; must be smaller or equal to %d bytes", pushBlockLen, maxPushBlockLen)
}
}
func newTestWriteRequest(seriesCount, labelsCount int) *prompbmarshal.WriteRequest {
var wr prompbmarshal.WriteRequest
for i := 0; i < seriesCount; i++ {
var labels []prompbmarshal.Label
for j := 0; j < labelsCount; j++ {
labels = append(labels, prompbmarshal.Label{
Name: fmt.Sprintf("label_%d_%d", i, j),
Value: fmt.Sprintf("value_%d_%d", i, j),
})
}
wr.Timeseries = append(wr.Timeseries, prompbmarshal.TimeSeries{
Labels: labels,
Samples: []prompbmarshal.Sample{
{
Value: float64(i),
Timestamp: 1000*int64(i),
},
},
})
}
return &wr
}

View file

@ -0,0 +1,36 @@
package remotewrite
import (
"fmt"
"testing"
"github.com/golang/snappy"
"github.com/klauspost/compress/s2"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
func BenchmarkCompressWriteRequestSnappy(b *testing.B) {
b.Run("snappy", func(b *testing.B) {
benchmarkCompressWriteRequest(b, snappy.Encode)
})
b.Run("s2", func(b *testing.B) {
benchmarkCompressWriteRequest(b, s2.EncodeSnappy)
})
}
func benchmarkCompressWriteRequest(b *testing.B, compressFunc func(dst, src []byte) []byte) {
for _, rowsCount := range []int{1, 10, 100, 1e3, 1e4} {
b.Run(fmt.Sprintf("rows_%d", rowsCount), func(b *testing.B) {
wr := newTestWriteRequest(rowsCount, 10)
data := prompbmarshal.MarshalWriteRequest(nil, wr)
b.ReportAllocs()
b.SetBytes(int64(rowsCount))
b.RunParallel(func(pb *testing.PB) {
var zb []byte
for pb.Next() {
zb = compressFunc(zb[:cap(zb)], data)
}
})
})
}
}

2
go.mod
View file

@ -4,7 +4,7 @@ go 1.19
require (
cloud.google.com/go/storage v1.26.0
github.com/VictoriaMetrics/fastcache v1.10.0
github.com/VictoriaMetrics/fastcache v1.12.0
// Do not use the original github.com/valyala/fasthttp because of issues
// like https://github.com/valyala/fasthttp/commit/996610f021ff45fdc98c2ce7884d5fa4e7f9199b

4
go.sum
View file

@ -104,8 +104,8 @@ github.com/PuerkitoBio/urlesc v0.0.0-20160726150825-5bd2802263f2/go.mod h1:uGdko
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE=
github.com/Shopify/sarama v1.19.0/go.mod h1:FVkBWblsNy7DGZRfXLU0O9RCGt5g3g3yEuWXgklEdEo=
github.com/Shopify/toxiproxy v2.1.4+incompatible/go.mod h1:OXgGpZ6Cli1/URJOF1DMxUHB2q5Ap20/P/eIdh4G0pI=
github.com/VictoriaMetrics/fastcache v1.10.0 h1:5hDJnLsKLpnUEToub7ETuRu8RCkb40woBZAUiKonXzY=
github.com/VictoriaMetrics/fastcache v1.10.0/go.mod h1:tjiYeEfYXCqacuvYw/7UoDIeJaNxq6132xHICNP77w8=
github.com/VictoriaMetrics/fastcache v1.12.0 h1:vnVi/y9yKDcD9akmc4NqAoqgQhJrOwUF+j9LTgn4QDE=
github.com/VictoriaMetrics/fastcache v1.12.0/go.mod h1:tjiYeEfYXCqacuvYw/7UoDIeJaNxq6132xHICNP77w8=
github.com/VictoriaMetrics/fasthttp v1.1.0 h1:3crd4YWHsMwu60GUXRH6OstowiFvqrwS4a/ueoLdLL0=
github.com/VictoriaMetrics/fasthttp v1.1.0/go.mod h1:/7DMcogqd+aaD3G3Hg5kFgoFwlR2uydjiWvoLp5ZTqQ=
github.com/VictoriaMetrics/metrics v1.18.1/go.mod h1:ArjwVz7WpgpegX/JpB0zpNF2h2232kErkEnzH1sxMmA=

15
vendor/github.com/klauspost/compress/s2/.gitignore generated vendored Normal file
View file

@ -0,0 +1,15 @@
testdata/bench
# These explicitly listed benchmark data files are for an obsolete version of
# snappy_test.go.
testdata/alice29.txt
testdata/asyoulik.txt
testdata/fireworks.jpeg
testdata/geo.protodata
testdata/html
testdata/html_x_4
testdata/kppkn.gtb
testdata/lcet10.txt
testdata/paper-100k.pdf
testdata/plrabn12.txt
testdata/urls.10K

28
vendor/github.com/klauspost/compress/s2/LICENSE generated vendored Normal file
View file

@ -0,0 +1,28 @@
Copyright (c) 2011 The Snappy-Go Authors. All rights reserved.
Copyright (c) 2019 Klaus Post. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

965
vendor/github.com/klauspost/compress/s2/README.md generated vendored Normal file
View file

@ -0,0 +1,965 @@
# S2 Compression
S2 is an extension of [Snappy](https://github.com/google/snappy).
S2 is aimed for high throughput, which is why it features concurrent compression for bigger payloads.
Decoding is compatible with Snappy compressed content, but content compressed with S2 cannot be decompressed by Snappy.
This means that S2 can seamlessly replace Snappy without converting compressed content.
S2 can produce Snappy compatible output, faster and better than Snappy.
If you want full benefit of the changes you should use s2 without Snappy compatibility.
S2 is designed to have high throughput on content that cannot be compressed.
This is important, so you don't have to worry about spending CPU cycles on already compressed data.
## Benefits over Snappy
* Better compression
* Adjustable compression (3 levels)
* Concurrent stream compression
* Faster decompression, even for Snappy compatible content
* Concurrent Snappy/S2 stream decompression
* Ability to quickly skip forward in compressed stream
* Random seeking with indexes
* Compatible with reading Snappy compressed content
* Smaller block size overhead on incompressible blocks
* Block concatenation
* Uncompressed stream mode
* Automatic stream size padding
* Snappy compatible block compression
## Drawbacks over Snappy
* Not optimized for 32 bit systems
* Streams use slightly more memory due to larger blocks and concurrency (configurable)
# Usage
Installation: `go get -u github.com/klauspost/compress/s2`
Full package documentation:
[![godoc][1]][2]
[1]: https://godoc.org/github.com/klauspost/compress?status.svg
[2]: https://godoc.org/github.com/klauspost/compress/s2
## Compression
```Go
func EncodeStream(src io.Reader, dst io.Writer) error {
enc := s2.NewWriter(dst)
_, err := io.Copy(enc, src)
if err != nil {
enc.Close()
return err
}
// Blocks until compression is done.
return enc.Close()
}
```
You should always call `enc.Close()`, otherwise you will leak resources and your encode will be incomplete.
For the best throughput, you should attempt to reuse the `Writer` using the `Reset()` method.
The Writer in S2 is always buffered, therefore `NewBufferedWriter` in Snappy can be replaced with `NewWriter` in S2.
It is possible to flush any buffered data using the `Flush()` method.
This will block until all data sent to the encoder has been written to the output.
S2 also supports the `io.ReaderFrom` interface, which will consume all input from a reader.
As a final method to compress data, if you have a single block of data you would like to have encoded as a stream,
a slightly more efficient method is to use the `EncodeBuffer` method.
This will take ownership of the buffer until the stream is closed.
```Go
func EncodeStream(src []byte, dst io.Writer) error {
enc := s2.NewWriter(dst)
// The encoder owns the buffer until Flush or Close is called.
err := enc.EncodeBuffer(buf)
if err != nil {
enc.Close()
return err
}
// Blocks until compression is done.
return enc.Close()
}
```
Each call to `EncodeBuffer` will result in discrete blocks being created without buffering,
so it should only be used a single time per stream.
If you need to write several blocks, you should use the regular io.Writer interface.
## Decompression
```Go
func DecodeStream(src io.Reader, dst io.Writer) error {
dec := s2.NewReader(src)
_, err := io.Copy(dst, dec)
return err
}
```
Similar to the Writer, a Reader can be reused using the `Reset` method.
For the best possible throughput, there is a `EncodeBuffer(buf []byte)` function available.
However, it requires that the provided buffer isn't used after it is handed over to S2 and until the stream is flushed or closed.
For smaller data blocks, there is also a non-streaming interface: `Encode()`, `EncodeBetter()` and `Decode()`.
Do however note that these functions (similar to Snappy) does not provide validation of data,
so data corruption may be undetected. Stream encoding provides CRC checks of data.
It is possible to efficiently skip forward in a compressed stream using the `Skip()` method.
For big skips the decompressor is able to skip blocks without decompressing them.
## Single Blocks
Similar to Snappy S2 offers single block compression.
Blocks do not offer the same flexibility and safety as streams,
but may be preferable for very small payloads, less than 100K.
Using a simple `dst := s2.Encode(nil, src)` will compress `src` and return the compressed result.
It is possible to provide a destination buffer.
If the buffer has a capacity of `s2.MaxEncodedLen(len(src))` it will be used.
If not a new will be allocated.
Alternatively `EncodeBetter`/`EncodeBest` can also be used for better, but slightly slower compression.
Similarly to decompress a block you can use `dst, err := s2.Decode(nil, src)`.
Again an optional destination buffer can be supplied.
The `s2.DecodedLen(src)` can be used to get the minimum capacity needed.
If that is not satisfied a new buffer will be allocated.
Block function always operate on a single goroutine since it should only be used for small payloads.
# Commandline tools
Some very simply commandline tools are provided; `s2c` for compression and `s2d` for decompression.
Binaries can be downloaded on the [Releases Page](https://github.com/klauspost/compress/releases).
Installing then requires Go to be installed. To install them, use:
`go install github.com/klauspost/compress/s2/cmd/s2c@latest && go install github.com/klauspost/compress/s2/cmd/s2d@latest`
To build binaries to the current folder use:
`go build github.com/klauspost/compress/s2/cmd/s2c && go build github.com/klauspost/compress/s2/cmd/s2d`
## s2c
```
Usage: s2c [options] file1 file2
Compresses all files supplied as input separately.
Output files are written as 'filename.ext.s2' or 'filename.ext.snappy'.
By default output files will be overwritten.
Use - as the only file name to read from stdin and write to stdout.
Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
File names beginning with 'http://' and 'https://' will be downloaded and compressed.
Only http response code 200 is accepted.
Options:
-bench int
Run benchmark n times. No output will be written
-blocksize string
Max block size. Examples: 64K, 256K, 1M, 4M. Must be power of two and <= 4MB (default "4M")
-c Write all output to stdout. Multiple input files will be concatenated
-cpu int
Compress using this amount of threads (default 32)
-faster
Compress faster, but with a minor compression loss
-help
Display help
-index
Add seek index (default true)
-o string
Write output to another file. Single input file only
-pad string
Pad size to a multiple of this value, Examples: 500, 64K, 256K, 1M, 4M, etc (default "1")
-q Don't write any output to terminal, except errors
-rm
Delete source file(s) after successful compression
-safe
Do not overwrite output files
-slower
Compress more, but a lot slower
-snappy
Generate Snappy compatible output stream
-verify
Verify written files
```
## s2d
```
Usage: s2d [options] file1 file2
Decompresses all files supplied as input. Input files must end with '.s2' or '.snappy'.
Output file names have the extension removed. By default output files will be overwritten.
Use - as the only file name to read from stdin and write to stdout.
Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
File names beginning with 'http://' and 'https://' will be downloaded and decompressed.
Extensions on downloaded files are ignored. Only http response code 200 is accepted.
Options:
-bench int
Run benchmark n times. No output will be written
-c Write all output to stdout. Multiple input files will be concatenated
-help
Display help
-o string
Write output to another file. Single input file only
-offset string
Start at offset. Examples: 92, 64K, 256K, 1M, 4M. Requires Index
-q Don't write any output to terminal, except errors
-rm
Delete source file(s) after successful decompression
-safe
Do not overwrite output files
-tail string
Return last of compressed file. Examples: 92, 64K, 256K, 1M, 4M. Requires Index
-verify
Verify files, but do not write output
```
## s2sx: self-extracting archives
s2sx allows creating self-extracting archives with no dependencies.
By default, executables are created for the same platforms as the host os,
but this can be overridden with `-os` and `-arch` parameters.
Extracted files have 0666 permissions, except when untar option used.
```
Usage: s2sx [options] file1 file2
Compresses all files supplied as input separately.
If files have '.s2' extension they are assumed to be compressed already.
Output files are written as 'filename.s2sx' and with '.exe' for windows targets.
If output is big, an additional file with ".more" is written. This must be included as well.
By default output files will be overwritten.
Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
Options:
-arch string
Destination architecture (default "amd64")
-c Write all output to stdout. Multiple input files will be concatenated
-cpu int
Compress using this amount of threads (default 32)
-help
Display help
-max string
Maximum executable size. Rest will be written to another file. (default "1G")
-os string
Destination operating system (default "windows")
-q Don't write any output to terminal, except errors
-rm
Delete source file(s) after successful compression
-safe
Do not overwrite output files
-untar
Untar on destination
```
Available platforms are:
* darwin-amd64
* darwin-arm64
* linux-amd64
* linux-arm
* linux-arm64
* linux-mips64
* linux-ppc64le
* windows-386
* windows-amd64
By default, there is a size limit of 1GB for the output executable.
When this is exceeded the remaining file content is written to a file called
output+`.more`. This file must be included for a successful extraction and
placed alongside the executable for a successful extraction.
This file *must* have the same name as the executable, so if the executable is renamed,
so must the `.more` file.
This functionality is disabled with stdin/stdout.
### Self-extracting TAR files
If you wrap a TAR file you can specify `-untar` to make it untar on the destination host.
Files are extracted to the current folder with the path specified in the tar file.
Note that tar files are not validated before they are wrapped.
For security reasons files that move below the root folder are not allowed.
# Performance
This section will focus on comparisons to Snappy.
This package is solely aimed at replacing Snappy as a high speed compression package.
If you are mainly looking for better compression [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd)
gives better compression, but typically at speeds slightly below "better" mode in this package.
Compression is increased compared to Snappy, mostly around 5-20% and the throughput is typically 25-40% increased (single threaded) compared to the Snappy Go implementation.
Streams are concurrently compressed. The stream will be distributed among all available CPU cores for the best possible throughput.
A "better" compression mode is also available. This allows to trade a bit of speed for a minor compression gain.
The content compressed in this mode is fully compatible with the standard decoder.
Snappy vs S2 **compression** speed on 16 core (32 thread) computer, using all threads and a single thread (1 CPU):
| File | S2 speed | S2 Throughput | S2 % smaller | S2 "better" | "better" throughput | "better" % smaller |
|-----------------------------------------------------------------------------------------------------|----------|---------------|--------------|-------------|---------------------|--------------------|
| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 12.70x | 10556 MB/s | 7.35% | 4.15x | 3455 MB/s | 12.79% |
| (1 CPU) | 1.14x | 948 MB/s | - | 0.42x | 349 MB/s | - |
| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 17.13x | 14484 MB/s | 31.60% | 10.09x | 8533 MB/s | 37.71% |
| (1 CPU) | 1.33x | 1127 MB/s | - | 0.70x | 589 MB/s | - |
| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 15.14x | 12000 MB/s | -5.79% | 6.59x | 5223 MB/s | 5.80% |
| (1 CPU) | 1.11x | 877 MB/s | - | 0.47x | 370 MB/s | - |
| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 14.62x | 12116 MB/s | 15.90% | 5.35x | 4430 MB/s | 16.08% |
| (1 CPU) | 1.38x | 1146 MB/s | - | 0.38x | 312 MB/s | - |
| [adresser.json](https://files.klauspost.com/compress/adresser.json.zst) | 8.83x | 17579 MB/s | 43.86% | 6.54x | 13011 MB/s | 47.23% |
| (1 CPU) | 1.14x | 2259 MB/s | - | 0.74x | 1475 MB/s | - |
| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 16.72x | 14019 MB/s | 24.02% | 10.11x | 8477 MB/s | 30.48% |
| (1 CPU) | 1.24x | 1043 MB/s | - | 0.70x | 586 MB/s | - |
| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 13.33x | 9254 MB/s | 1.84% | 6.75x | 4686 MB/s | 6.72% |
| (1 CPU) | 0.97x | 672 MB/s | - | 0.53x | 366 MB/s | - |
| sharnd.out.2gb | 2.11x | 12639 MB/s | 0.01% | 1.98x | 11833 MB/s | 0.01% |
| (1 CPU) | 0.93x | 5594 MB/s | - | 1.34x | 8030 MB/s | - |
| [enwik9](http://mattmahoney.net/dc/textdata.html) | 19.34x | 8220 MB/s | 3.98% | 7.87x | 3345 MB/s | 15.82% |
| (1 CPU) | 1.06x | 452 MB/s | - | 0.50x | 213 MB/s | - |
| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 10.48x | 6124 MB/s | 5.67% | 3.76x | 2197 MB/s | 12.60% |
| (1 CPU) | 0.97x | 568 MB/s | - | 0.46x | 271 MB/s | - |
| [enwik10](https://encode.su/threads/3315-enwik10-benchmark-results) | 21.07x | 9020 MB/s | 6.36% | 6.91x | 2959 MB/s | 16.95% |
| (1 CPU) | 1.07x | 460 MB/s | - | 0.51x | 220 MB/s | - |
### Legend
* `S2 speed`: Speed of S2 compared to Snappy, using 16 cores and 1 core.
* `S2 throughput`: Throughput of S2 in MB/s.
* `S2 % smaller`: How many percent of the Snappy output size is S2 better.
* `S2 "better"`: Speed when enabling "better" compression mode in S2 compared to Snappy.
* `"better" throughput`: Speed when enabling "better" compression mode in S2 compared to Snappy.
* `"better" % smaller`: How many percent of the Snappy output size is S2 better when using "better" compression.
There is a good speedup across the board when using a single thread and a significant speedup when using multiple threads.
Machine generated data gets by far the biggest compression boost, with size being being reduced by up to 45% of Snappy size.
The "better" compression mode sees a good improvement in all cases, but usually at a performance cost.
Incompressible content (`sharnd.out.2gb`, 2GB random data) sees the smallest speedup.
This is likely dominated by synchronization overhead, which is confirmed by the fact that single threaded performance is higher (see above).
## Decompression
S2 attempts to create content that is also fast to decompress, except in "better" mode where the smallest representation is used.
S2 vs Snappy **decompression** speed. Both operating on single core:
| File | S2 Throughput | vs. Snappy | Better Throughput | vs. Snappy |
|-----------------------------------------------------------------------------------------------------|---------------|------------|-------------------|------------|
| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 2117 MB/s | 1.14x | 1738 MB/s | 0.94x |
| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 2401 MB/s | 1.25x | 2307 MB/s | 1.20x |
| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 2075 MB/s | 0.98x | 1764 MB/s | 0.83x |
| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 2967 MB/s | 1.05x | 2885 MB/s | 1.02x |
| [adresser.json](https://files.klauspost.com/compress/adresser.json.zst) | 4141 MB/s | 1.07x | 4184 MB/s | 1.08x |
| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 2264 MB/s | 1.12x | 2185 MB/s | 1.08x |
| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 1525 MB/s | 1.03x | 1347 MB/s | 0.91x |
| sharnd.out.2gb | 3813 MB/s | 0.79x | 3900 MB/s | 0.81x |
| [enwik9](http://mattmahoney.net/dc/textdata.html) | 1246 MB/s | 1.29x | 967 MB/s | 1.00x |
| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 1433 MB/s | 1.12x | 1203 MB/s | 0.94x |
| [enwik10](https://encode.su/threads/3315-enwik10-benchmark-results) | 1284 MB/s | 1.32x | 1010 MB/s | 1.04x |
### Legend
* `S2 Throughput`: Decompression speed of S2 encoded content.
* `Better Throughput`: Decompression speed of S2 "better" encoded content.
* `vs Snappy`: Decompression speed of S2 "better" mode compared to Snappy and absolute speed.
While the decompression code hasn't changed, there is a significant speedup in decompression speed.
S2 prefers longer matches and will typically only find matches that are 6 bytes or longer.
While this reduces compression a bit, it improves decompression speed.
The "better" compression mode will actively look for shorter matches, which is why it has a decompression speed quite similar to Snappy.
Without assembly decompression is also very fast; single goroutine decompression speed. No assembly:
| File | S2 Throughput | S2 throughput |
|--------------------------------|--------------|---------------|
| consensus.db.10gb.s2 | 1.84x | 2289.8 MB/s |
| 10gb.tar.s2 | 1.30x | 867.07 MB/s |
| rawstudio-mint14.tar.s2 | 1.66x | 1329.65 MB/s |
| github-june-2days-2019.json.s2 | 2.36x | 1831.59 MB/s |
| github-ranks-backup.bin.s2 | 1.73x | 1390.7 MB/s |
| enwik9.s2 | 1.67x | 681.53 MB/s |
| adresser.json.s2 | 3.41x | 4230.53 MB/s |
| silesia.tar.s2 | 1.52x | 811.58 |
Even though S2 typically compresses better than Snappy, decompression speed is always better.
### Concurrent Stream Decompression
For full stream decompression S2 offers a [DecodeConcurrent](https://pkg.go.dev/github.com/klauspost/compress/s2#Reader.DecodeConcurrent)
that will decode a full stream using multiple goroutines.
Example scaling, AMD Ryzen 3950X, 16 cores, decompression using `s2d -bench=3 <input>`, best of 3:
| Input | `-cpu=1` | `-cpu=2` | `-cpu=4` | `-cpu=8` | `-cpu=16` |
|-------------------------------------------|------------|------------|------------|------------|-------------|
| enwik10.snappy | 1098.6MB/s | 1819.8MB/s | 3625.6MB/s | 6910.6MB/s | 10818.2MB/s |
| enwik10.s2 | 1303.5MB/s | 2606.1MB/s | 4847.9MB/s | 8878.4MB/s | 9592.1MB/s |
| sofia-air-quality-dataset.tar.snappy | 1302.0MB/s | 2165.0MB/s | 4244.5MB/s | 8241.0MB/s | 12920.5MB/s |
| sofia-air-quality-dataset.tar.s2 | 1399.2MB/s | 2463.2MB/s | 5196.5MB/s | 9639.8MB/s | 11439.5MB/s |
| sofia-air-quality-dataset.tar.s2 (no asm) | 837.5MB/s | 1652.6MB/s | 3183.6MB/s | 5945.0MB/s | 9620.7MB/s |
Scaling can be expected to be pretty linear until memory bandwidth is saturated.
For now the DecodeConcurrent can only be used for full streams without seeking or combining with regular reads.
## Block compression
When compressing blocks no concurrent compression is performed just as Snappy.
This is because blocks are for smaller payloads and generally will not benefit from concurrent compression.
An important change is that incompressible blocks will not be more than at most 10 bytes bigger than the input.
In rare, worst case scenario Snappy blocks could be significantly bigger than the input.
### Mixed content blocks
The most reliable is a wide dataset.
For this we use [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),
53927 files, total input size: 4,014,735,833 bytes. Single goroutine used.
| * | Input | Output | Reduction | MB/s |
|-------------------|------------|------------|-----------|--------|
| S2 | 4014735833 | 1059723369 | 73.60% | **934.34** |
| S2 Better | 4014735833 | 969670507 | 75.85% | 532.70 |
| S2 Best | 4014735833 | 906625668 | **77.85%** | 46.84 |
| Snappy | 4014735833 | 1128706759 | 71.89% | 762.59 |
| S2, Snappy Output | 4014735833 | 1093821420 | 72.75% | 908.60 |
| LZ4 | 4014735833 | 1079259294 | 73.12% | 526.94 |
S2 delivers both the best single threaded throughput with regular mode and the best compression rate with "best".
"Better" mode provides the same compression speed as LZ4 with better compression ratio.
When outputting Snappy compatible output it still delivers better throughput (150MB/s more) and better compression.
As can be seen from the other benchmarks decompression should also be easier on the S2 generated output.
Though they cannot be compared due to different decompression speeds here are the speed/size comparisons for
other Go compressors:
| * | Input | Output | Reduction | MB/s |
|-------------------|------------|------------|-----------|--------|
| Zstd Fastest (Go) | 4014735833 | 794608518 | 80.21% | 236.04 |
| Zstd Best (Go) | 4014735833 | 704603356 | 82.45% | 35.63 |
| Deflate (Go) l1 | 4014735833 | 871294239 | 78.30% | 214.04 |
| Deflate (Go) l9 | 4014735833 | 730389060 | 81.81% | 41.17 |
### Standard block compression
Benchmarking single block performance is subject to a lot more variation since it only tests a limited number of file patterns.
So individual benchmarks should only be seen as a guideline and the overall picture is more important.
These micro-benchmarks are with data in cache and trained branch predictors. For a more realistic benchmark see the mixed content above.
Block compression. Parallel benchmark running on 16 cores, 16 goroutines.
AMD64 assembly is use for both S2 and Snappy.
| Absolute Perf | Snappy size | S2 Size | Snappy Speed | S2 Speed | Snappy dec | S2 dec |
|-----------------------|-------------|---------|--------------|-------------|-------------|-------------|
| html | 22843 | 21111 | 16246 MB/s | 17438 MB/s | 40972 MB/s | 49263 MB/s |
| urls.10K | 335492 | 287326 | 7943 MB/s | 9693 MB/s | 22523 MB/s | 26484 MB/s |
| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 273889 MB/s | 718321 MB/s | 827552 MB/s |
| fireworks.jpeg (200B) | 146 | 155 | 8869 MB/s | 17773 MB/s | 33691 MB/s | 52421 MB/s |
| paper-100k.pdf | 85304 | 84459 | 167546 MB/s | 101263 MB/s | 326905 MB/s | 291944 MB/s |
| html_x_4 | 92234 | 21113 | 15194 MB/s | 50670 MB/s | 30843 MB/s | 32217 MB/s |
| alice29.txt | 88034 | 85975 | 5936 MB/s | 6139 MB/s | 12882 MB/s | 20044 MB/s |
| asyoulik.txt | 77503 | 79650 | 5517 MB/s | 6366 MB/s | 12735 MB/s | 22806 MB/s |
| lcet10.txt | 234661 | 220670 | 6235 MB/s | 6067 MB/s | 14519 MB/s | 18697 MB/s |
| plrabn12.txt | 319267 | 317985 | 5159 MB/s | 5726 MB/s | 11923 MB/s | 19901 MB/s |
| geo.protodata | 23335 | 18690 | 21220 MB/s | 26529 MB/s | 56271 MB/s | 62540 MB/s |
| kppkn.gtb | 69526 | 65312 | 9732 MB/s | 8559 MB/s | 18491 MB/s | 18969 MB/s |
| alice29.txt (128B) | 80 | 82 | 6691 MB/s | 15489 MB/s | 31883 MB/s | 38874 MB/s |
| alice29.txt (1000B) | 774 | 774 | 12204 MB/s | 13000 MB/s | 48056 MB/s | 52341 MB/s |
| alice29.txt (10000B) | 6648 | 6933 | 10044 MB/s | 12806 MB/s | 32378 MB/s | 46322 MB/s |
| alice29.txt (20000B) | 12686 | 13574 | 7733 MB/s | 11210 MB/s | 30566 MB/s | 58969 MB/s |
| Relative Perf | Snappy size | S2 size improved | S2 Speed | S2 Dec Speed |
|-----------------------|-------------|------------------|----------|--------------|
| html | 22.31% | 7.58% | 1.07x | 1.20x |
| urls.10K | 47.78% | 14.36% | 1.22x | 1.18x |
| fireworks.jpeg | 99.95% | -0.05% | 0.78x | 1.15x |
| fireworks.jpeg (200B) | 73.00% | -6.16% | 2.00x | 1.56x |
| paper-100k.pdf | 83.30% | 0.99% | 0.60x | 0.89x |
| html_x_4 | 22.52% | 77.11% | 3.33x | 1.04x |
| alice29.txt | 57.88% | 2.34% | 1.03x | 1.56x |
| asyoulik.txt | 61.91% | -2.77% | 1.15x | 1.79x |
| lcet10.txt | 54.99% | 5.96% | 0.97x | 1.29x |
| plrabn12.txt | 66.26% | 0.40% | 1.11x | 1.67x |
| geo.protodata | 19.68% | 19.91% | 1.25x | 1.11x |
| kppkn.gtb | 37.72% | 6.06% | 0.88x | 1.03x |
| alice29.txt (128B) | 62.50% | -2.50% | 2.31x | 1.22x |
| alice29.txt (1000B) | 77.40% | 0.00% | 1.07x | 1.09x |
| alice29.txt (10000B) | 66.48% | -4.29% | 1.27x | 1.43x |
| alice29.txt (20000B) | 63.43% | -7.00% | 1.45x | 1.93x |
Speed is generally at or above Snappy. Small blocks gets a significant speedup, although at the expense of size.
Decompression speed is better than Snappy, except in one case.
Since payloads are very small the variance in terms of size is rather big, so they should only be seen as a general guideline.
Size is on average around Snappy, but varies on content type.
In cases where compression is worse, it usually is compensated by a speed boost.
### Better compression
Benchmarking single block performance is subject to a lot more variation since it only tests a limited number of file patterns.
So individual benchmarks should only be seen as a guideline and the overall picture is more important.
| Absolute Perf | Snappy size | Better Size | Snappy Speed | Better Speed | Snappy dec | Better dec |
|-----------------------|-------------|-------------|--------------|--------------|-------------|-------------|
| html | 22843 | 19833 | 16246 MB/s | 7731 MB/s | 40972 MB/s | 40292 MB/s |
| urls.10K | 335492 | 253529 | 7943 MB/s | 3980 MB/s | 22523 MB/s | 20981 MB/s |
| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 9760 MB/s | 718321 MB/s | 823698 MB/s |
| fireworks.jpeg (200B) | 146 | 142 | 8869 MB/s | 594 MB/s | 33691 MB/s | 30101 MB/s |
| paper-100k.pdf | 85304 | 82915 | 167546 MB/s | 7470 MB/s | 326905 MB/s | 198869 MB/s |
| html_x_4 | 92234 | 19841 | 15194 MB/s | 23403 MB/s | 30843 MB/s | 30937 MB/s |
| alice29.txt | 88034 | 73218 | 5936 MB/s | 2945 MB/s | 12882 MB/s | 16611 MB/s |
| asyoulik.txt | 77503 | 66844 | 5517 MB/s | 2739 MB/s | 12735 MB/s | 14975 MB/s |
| lcet10.txt | 234661 | 190589 | 6235 MB/s | 3099 MB/s | 14519 MB/s | 16634 MB/s |
| plrabn12.txt | 319267 | 270828 | 5159 MB/s | 2600 MB/s | 11923 MB/s | 13382 MB/s |
| geo.protodata | 23335 | 18278 | 21220 MB/s | 11208 MB/s | 56271 MB/s | 57961 MB/s |
| kppkn.gtb | 69526 | 61851 | 9732 MB/s | 4556 MB/s | 18491 MB/s | 16524 MB/s |
| alice29.txt (128B) | 80 | 81 | 6691 MB/s | 529 MB/s | 31883 MB/s | 34225 MB/s |
| alice29.txt (1000B) | 774 | 748 | 12204 MB/s | 1943 MB/s | 48056 MB/s | 42068 MB/s |
| alice29.txt (10000B) | 6648 | 6234 | 10044 MB/s | 2949 MB/s | 32378 MB/s | 28813 MB/s |
| alice29.txt (20000B) | 12686 | 11584 | 7733 MB/s | 2822 MB/s | 30566 MB/s | 27315 MB/s |
| Relative Perf | Snappy size | Better size | Better Speed | Better dec |
|-----------------------|-------------|-------------|--------------|------------|
| html | 22.31% | 13.18% | 0.48x | 0.98x |
| urls.10K | 47.78% | 24.43% | 0.50x | 0.93x |
| fireworks.jpeg | 99.95% | -0.05% | 0.03x | 1.15x |
| fireworks.jpeg (200B) | 73.00% | 2.74% | 0.07x | 0.89x |
| paper-100k.pdf | 83.30% | 2.80% | 0.07x | 0.61x |
| html_x_4 | 22.52% | 78.49% | 0.04x | 1.00x |
| alice29.txt | 57.88% | 16.83% | 1.54x | 1.29x |
| asyoulik.txt | 61.91% | 13.75% | 0.50x | 1.18x |
| lcet10.txt | 54.99% | 18.78% | 0.50x | 1.15x |
| plrabn12.txt | 66.26% | 15.17% | 0.50x | 1.12x |
| geo.protodata | 19.68% | 21.67% | 0.50x | 1.03x |
| kppkn.gtb | 37.72% | 11.04% | 0.53x | 0.89x |
| alice29.txt (128B) | 62.50% | -1.25% | 0.47x | 1.07x |
| alice29.txt (1000B) | 77.40% | 3.36% | 0.08x | 0.88x |
| alice29.txt (10000B) | 66.48% | 6.23% | 0.16x | 0.89x |
| alice29.txt (20000B) | 63.43% | 8.69% | 0.29x | 0.89x |
Except for the mostly incompressible JPEG image compression is better and usually in the
double digits in terms of percentage reduction over Snappy.
The PDF sample shows a significant slowdown compared to Snappy, as this mode tries harder
to compress the data. Very small blocks are also not favorable for better compression, so throughput is way down.
This mode aims to provide better compression at the expense of performance and achieves that
without a huge performance penalty, except on very small blocks.
Decompression speed suffers a little compared to the regular S2 mode,
but still manages to be close to Snappy in spite of increased compression.
# Best compression mode
S2 offers a "best" compression mode.
This will compress as much as possible with little regard to CPU usage.
Mainly for offline compression, but where decompression speed should still
be high and compatible with other S2 compressed data.
Some examples compared on 16 core CPU, amd64 assembly used:
```
* enwik10
Default... 10000000000 -> 4761467548 [47.61%]; 1.098s, 8685.6MB/s
Better... 10000000000 -> 4219438251 [42.19%]; 1.925s, 4954.2MB/s
Best... 10000000000 -> 3627364337 [36.27%]; 43.051s, 221.5MB/s
* github-june-2days-2019.json
Default... 6273951764 -> 1043196283 [16.63%]; 431ms, 13882.3MB/s
Better... 6273951764 -> 949146808 [15.13%]; 547ms, 10938.4MB/s
Best... 6273951764 -> 832855506 [13.27%]; 9.455s, 632.8MB/s
* nyc-taxi-data-10M.csv
Default... 3325605752 -> 1095998837 [32.96%]; 324ms, 9788.7MB/s
Better... 3325605752 -> 954776589 [28.71%]; 491ms, 6459.4MB/s
Best... 3325605752 -> 779098746 [23.43%]; 8.29s, 382.6MB/s
* 10gb.tar
Default... 10065157632 -> 5916578242 [58.78%]; 1.028s, 9337.4MB/s
Better... 10065157632 -> 5649207485 [56.13%]; 1.597s, 6010.6MB/s
Best... 10065157632 -> 5208719802 [51.75%]; 32.78s, 292.8MB/
* consensus.db.10gb
Default... 10737418240 -> 4562648848 [42.49%]; 882ms, 11610.0MB/s
Better... 10737418240 -> 4542428129 [42.30%]; 1.533s, 6679.7MB/s
Best... 10737418240 -> 4244773384 [39.53%]; 42.96s, 238.4MB/s
```
Decompression speed should be around the same as using the 'better' compression mode.
# Snappy Compatibility
S2 now offers full compatibility with Snappy.
This means that the efficient encoders of S2 can be used to generate fully Snappy compatible output.
There is a [snappy](https://github.com/klauspost/compress/tree/master/snappy) package that can be used by
simply changing imports from `github.com/golang/snappy` to `github.com/klauspost/compress/snappy`.
This uses "better" mode for all operations.
If you would like more control, you can use the s2 package as described below:
## Blocks
Snappy compatible blocks can be generated with the S2 encoder.
Compression and speed is typically a bit better `MaxEncodedLen` is also smaller for smaller memory usage. Replace
| Snappy | S2 replacement |
|----------------------------|-------------------------|
| snappy.Encode(...) | s2.EncodeSnappy(...) |
| snappy.MaxEncodedLen(...) | s2.MaxEncodedLen(...) |
`s2.EncodeSnappy` can be replaced with `s2.EncodeSnappyBetter` or `s2.EncodeSnappyBest` to get more efficiently compressed snappy compatible output.
`s2.ConcatBlocks` is compatible with snappy blocks.
Comparison of [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),
53927 files, total input size: 4,014,735,833 bytes. amd64, single goroutine used:
| Encoder | Size | MB/s | Reduction |
|-----------------------|------------|------------|------------
| snappy.Encode | 1128706759 | 725.59 | 71.89% |
| s2.EncodeSnappy | 1093823291 | **899.16** | 72.75% |
| s2.EncodeSnappyBetter | 1001158548 | 578.49 | 75.06% |
| s2.EncodeSnappyBest | 944507998 | 66.00 | **76.47%**|
## Streams
For streams, replace `enc = snappy.NewBufferedWriter(w)` with `enc = s2.NewWriter(w, s2.WriterSnappyCompat())`.
All other options are available, but note that block size limit is different for snappy.
Comparison of different streams, AMD Ryzen 3950x, 16 cores. Size and throughput:
| File | snappy.NewWriter | S2 Snappy | S2 Snappy, Better | S2 Snappy, Best |
|-----------------------------|--------------------------|---------------------------|--------------------------|-------------------------|
| nyc-taxi-data-10M.csv | 1316042016 - 539.47MB/s | 1307003093 - 10132.73MB/s | 1174534014 - 5002.44MB/s | 1115904679 - 177.97MB/s |
| enwik10 (xml) | 5088294643 - 451.13MB/s | 5175840939 - 9440.69MB/s | 4560784526 - 4487.21MB/s | 4340299103 - 158.92MB/s |
| 10gb.tar (mixed) | 6056946612 - 729.73MB/s | 6208571995 - 9978.05MB/s | 5741646126 - 4919.98MB/s | 5548973895 - 180.44MB/s |
| github-june-2days-2019.json | 1525176492 - 933.00MB/s | 1476519054 - 13150.12MB/s | 1400547532 - 5803.40MB/s | 1321887137 - 204.29MB/s |
| consensus.db.10gb (db) | 5412897703 - 1102.14MB/s | 5354073487 - 13562.91MB/s | 5335069899 - 5294.73MB/s | 5201000954 - 175.72MB/s |
# Decompression
All decompression functions map directly to equivalent s2 functions.
| Snappy | S2 replacement |
|------------------------|--------------------|
| snappy.Decode(...) | s2.Decode(...) |
| snappy.DecodedLen(...) | s2.DecodedLen(...) |
| snappy.NewReader(...) | s2.NewReader(...) |
Features like [quick forward skipping without decompression](https://pkg.go.dev/github.com/klauspost/compress/s2#Reader.Skip)
are also available for Snappy streams.
If you know you are only decompressing snappy streams, setting [`ReaderMaxBlockSize(64<<10)`](https://pkg.go.dev/github.com/klauspost/compress/s2#ReaderMaxBlockSize)
on your Reader will reduce memory consumption.
# Concatenating blocks and streams.
Concatenating streams will concatenate the output of both without recompressing them.
While this is inefficient in terms of compression it might be usable in certain scenarios.
The 10 byte 'stream identifier' of the second stream can optionally be stripped, but it is not a requirement.
Blocks can be concatenated using the `ConcatBlocks` function.
Snappy blocks/streams can safely be concatenated with S2 blocks and streams.
Streams with indexes (see below) will currently not work on concatenated streams.
# Stream Seek Index
S2 and Snappy streams can have indexes. These indexes will allow random seeking within the compressed data.
The index can either be appended to the stream as a skippable block or returned for separate storage.
When the index is appended to a stream it will be skipped by regular decoders,
so the output remains compatible with other decoders.
## Creating an Index
To automatically add an index to a stream, add `WriterAddIndex()` option to your writer.
Then the index will be added to the stream when `Close()` is called.
```
// Add Index to stream...
enc := s2.NewWriter(w, s2.WriterAddIndex())
io.Copy(enc, r)
enc.Close()
```
If you want to store the index separately, you can use `CloseIndex()` instead of the regular `Close()`.
This will return the index. Note that `CloseIndex()` should only be called once, and you shouldn't call `Close()`.
```
// Get index for separate storage...
enc := s2.NewWriter(w)
io.Copy(enc, r)
index, err := enc.CloseIndex()
```
The `index` can then be used needing to read from the stream.
This means the index can be used without needing to seek to the end of the stream
or for manually forwarding streams. See below.
Finally, an existing S2/Snappy stream can be indexed using the `s2.IndexStream(r io.Reader)` function.
## Using Indexes
To use indexes there is a `ReadSeeker(random bool, index []byte) (*ReadSeeker, error)` function available.
Calling ReadSeeker will return an [io.ReadSeeker](https://pkg.go.dev/io#ReadSeeker) compatible version of the reader.
If 'random' is specified the returned io.Seeker can be used for random seeking, otherwise only forward seeking is supported.
Enabling random seeking requires the original input to support the [io.Seeker](https://pkg.go.dev/io#Seeker) interface.
```
dec := s2.NewReader(r)
rs, err := dec.ReadSeeker(false, nil)
rs.Seek(wantOffset, io.SeekStart)
```
Get a seeker to seek forward. Since no index is provided, the index is read from the stream.
This requires that an index was added and that `r` supports the [io.Seeker](https://pkg.go.dev/io#Seeker) interface.
A custom index can be specified which will be used if supplied.
When using a custom index, it will not be read from the input stream.
```
dec := s2.NewReader(r)
rs, err := dec.ReadSeeker(false, index)
rs.Seek(wantOffset, io.SeekStart)
```
This will read the index from `index`. Since we specify non-random (forward only) seeking `r` does not have to be an io.Seeker
```
dec := s2.NewReader(r)
rs, err := dec.ReadSeeker(true, index)
rs.Seek(wantOffset, io.SeekStart)
```
Finally, since we specify that we want to do random seeking `r` must be an io.Seeker.
The returned [ReadSeeker](https://pkg.go.dev/github.com/klauspost/compress/s2#ReadSeeker) contains a shallow reference to the existing Reader,
meaning changes performed to one is reflected in the other.
To check if a stream contains an index at the end, the `(*Index).LoadStream(rs io.ReadSeeker) error` can be used.
## Manually Forwarding Streams
Indexes can also be read outside the decoder using the [Index](https://pkg.go.dev/github.com/klauspost/compress/s2#Index) type.
This can be used for parsing indexes, either separate or in streams.
In some cases it may not be possible to serve a seekable stream.
This can for instance be an HTTP stream, where the Range request
is sent at the start of the stream.
With a little bit of extra code it is still possible to use indexes
to forward to specific offset with a single forward skip.
It is possible to load the index manually like this:
```
var index s2.Index
_, err = index.Load(idxBytes)
```
This can be used to figure out how much to offset the compressed stream:
```
compressedOffset, uncompressedOffset, err := index.Find(wantOffset)
```
The `compressedOffset` is the number of bytes that should be skipped
from the beginning of the compressed file.
The `uncompressedOffset` will then be offset of the uncompressed bytes returned
when decoding from that position. This will always be <= wantOffset.
When creating a decoder it must be specified that it should *not* expect a stream identifier
at the beginning of the stream. Assuming the io.Reader `r` has been forwarded to `compressedOffset`
we create the decoder like this:
```
dec := s2.NewReader(r, s2.ReaderIgnoreStreamIdentifier())
```
We are not completely done. We still need to forward the stream the uncompressed bytes we didn't want.
This is done using the regular "Skip" function:
```
err = dec.Skip(wantOffset - uncompressedOffset)
```
This will ensure that we are at exactly the offset we want, and reading from `dec` will start at the requested offset.
## Index Format:
Each block is structured as a snappy skippable block, with the chunk ID 0x99.
The block can be read from the front, but contains information so it can be read from the back as well.
Numbers are stored as fixed size little endian values or [zigzag encoded](https://developers.google.com/protocol-buffers/docs/encoding#signed_integers) [base 128 varints](https://developers.google.com/protocol-buffers/docs/encoding),
with un-encoded value length of 64 bits, unless other limits are specified.
| Content | Format |
|---------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------|
| ID, `[1]byte` | Always 0x99. |
| Data Length, `[3]byte` | 3 byte little-endian length of the chunk in bytes, following this. |
| Header `[6]byte` | Header, must be `[115, 50, 105, 100, 120, 0]` or in text: "s2idx\x00". |
| UncompressedSize, Varint | Total Uncompressed size. |
| CompressedSize, Varint | Total Compressed size if known. Should be -1 if unknown. |
| EstBlockSize, Varint | Block Size, used for guessing uncompressed offsets. Must be >= 0. |
| Entries, Varint | Number of Entries in index, must be < 65536 and >=0. |
| HasUncompressedOffsets `byte` | 0 if no uncompressed offsets are present, 1 if present. Other values are invalid. |
| UncompressedOffsets, [Entries]VarInt | Uncompressed offsets. See below how to decode. |
| CompressedOffsets, [Entries]VarInt | Compressed offsets. See below how to decode. |
| Block Size, `[4]byte` | Little Endian total encoded size (including header and trailer). Can be used for searching backwards to start of block. |
| Trailer `[6]byte` | Trailer, must be `[0, 120, 100, 105, 50, 115]` or in text: "\x00xdi2s". Can be used for identifying block from end of stream. |
For regular streams the uncompressed offsets are fully predictable,
so `HasUncompressedOffsets` allows to specify that compressed blocks all have
exactly `EstBlockSize` bytes of uncompressed content.
Entries *must* be in order, starting with the lowest offset,
and there *must* be no uncompressed offset duplicates.
Entries *may* point to the start of a skippable block,
but it is then not allowed to also have an entry for the next block since
that would give an uncompressed offset duplicate.
There is no requirement for all blocks to be represented in the index.
In fact there is a maximum of 65536 block entries in an index.
The writer can use any method to reduce the number of entries.
An implicit block start at 0,0 can be assumed.
### Decoding entries:
```
// Read Uncompressed entries.
// Each assumes EstBlockSize delta from previous.
for each entry {
uOff = 0
if HasUncompressedOffsets == 1 {
uOff = ReadVarInt // Read value from stream
}
// Except for the first entry, use previous values.
if entryNum == 0 {
entry[entryNum].UncompressedOffset = uOff
continue
}
// Uncompressed uses previous offset and adds EstBlockSize
entry[entryNum].UncompressedOffset = entry[entryNum-1].UncompressedOffset + EstBlockSize + uOff
}
// Guess that the first block will be 50% of uncompressed size.
// Integer truncating division must be used.
CompressGuess := EstBlockSize / 2
// Read Compressed entries.
// Each assumes CompressGuess delta from previous.
// CompressGuess is adjusted for each value.
for each entry {
cOff = ReadVarInt // Read value from stream
// Except for the first entry, use previous values.
if entryNum == 0 {
entry[entryNum].CompressedOffset = cOff
continue
}
// Compressed uses previous and our estimate.
entry[entryNum].CompressedOffset = entry[entryNum-1].CompressedOffset + CompressGuess + cOff
// Adjust compressed offset for next loop, integer truncating division must be used.
CompressGuess += cOff/2
}
```
To decode from any given uncompressed offset `(wantOffset)`:
* Iterate entries until `entry[n].UncompressedOffset > wantOffset`.
* Start decoding from `entry[n-1].CompressedOffset`.
* Discard `entry[n-1].UncompressedOffset - wantOffset` bytes from the decoded stream.
See [using indexes](https://github.com/klauspost/compress/tree/master/s2#using-indexes) for functions that perform the operations with a simpler interface.
# Format Extensions
* Frame [Stream identifier](https://github.com/google/snappy/blob/master/framing_format.txt#L68) changed from `sNaPpY` to `S2sTwO`.
* [Framed compressed blocks](https://github.com/google/snappy/blob/master/format_description.txt) can be up to 4MB (up from 64KB).
* Compressed blocks can have an offset of `0`, which indicates to repeat the last seen offset.
Repeat offsets must be encoded as a [2.2.1. Copy with 1-byte offset (01)](https://github.com/google/snappy/blob/master/format_description.txt#L89), where the offset is 0.
The length is specified by reading the 3-bit length specified in the tag and decode using this table:
| Length | Actual Length |
|--------|----------------------|
| 0 | 4 |
| 1 | 5 |
| 2 | 6 |
| 3 | 7 |
| 4 | 8 |
| 5 | 8 + read 1 byte |
| 6 | 260 + read 2 bytes |
| 7 | 65540 + read 3 bytes |
This allows any repeat offset + length to be represented by 2 to 5 bytes.
Lengths are stored as little endian values.
The first copy of a block cannot be a repeat offset and the offset is not carried across blocks in streams.
Default streaming block size is 1MB.
# LICENSE
This code is based on the [Snappy-Go](https://github.com/golang/snappy) implementation.
Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.

1046
vendor/github.com/klauspost/compress/s2/decode.go generated vendored Normal file

File diff suppressed because it is too large Load diff

568
vendor/github.com/klauspost/compress/s2/decode_amd64.s generated vendored Normal file
View file

@ -0,0 +1,568 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Copyright (c) 2019 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !appengine
// +build gc
// +build !noasm
#include "textflag.h"
#define R_TMP0 AX
#define R_TMP1 BX
#define R_LEN CX
#define R_OFF DX
#define R_SRC SI
#define R_DST DI
#define R_DBASE R8
#define R_DLEN R9
#define R_DEND R10
#define R_SBASE R11
#define R_SLEN R12
#define R_SEND R13
#define R_TMP2 R14
#define R_TMP3 R15
// The asm code generally follows the pure Go code in decode_other.go, except
// where marked with a "!!!".
// func decode(dst, src []byte) int
//
// All local variables fit into registers. The non-zero stack size is only to
// spill registers and push args when issuing a CALL. The register allocation:
// - R_TMP0 scratch
// - R_TMP1 scratch
// - R_LEN length or x (shared)
// - R_OFF offset
// - R_SRC &src[s]
// - R_DST &dst[d]
// + R_DBASE dst_base
// + R_DLEN dst_len
// + R_DEND dst_base + dst_len
// + R_SBASE src_base
// + R_SLEN src_len
// + R_SEND src_base + src_len
// - R_TMP2 used by doCopy
// - R_TMP3 used by doCopy
//
// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
// function, and after a CALL returns, and are not otherwise modified.
//
// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
TEXT ·s2Decode(SB), NOSPLIT, $48-56
// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
MOVQ dst_base+0(FP), R_DBASE
MOVQ dst_len+8(FP), R_DLEN
MOVQ R_DBASE, R_DST
MOVQ R_DBASE, R_DEND
ADDQ R_DLEN, R_DEND
MOVQ src_base+24(FP), R_SBASE
MOVQ src_len+32(FP), R_SLEN
MOVQ R_SBASE, R_SRC
MOVQ R_SBASE, R_SEND
ADDQ R_SLEN, R_SEND
XORQ R_OFF, R_OFF
loop:
// for s < len(src)
CMPQ R_SRC, R_SEND
JEQ end
// R_LEN = uint32(src[s])
//
// switch src[s] & 0x03
MOVBLZX (R_SRC), R_LEN
MOVL R_LEN, R_TMP1
ANDL $3, R_TMP1
CMPL R_TMP1, $1
JAE tagCopy
// ----------------------------------------
// The code below handles literal tags.
// case tagLiteral:
// x := uint32(src[s] >> 2)
// switch
SHRL $2, R_LEN
CMPL R_LEN, $60
JAE tagLit60Plus
// case x < 60:
// s++
INCQ R_SRC
doLit:
// This is the end of the inner "switch", when we have a literal tag.
//
// We assume that R_LEN == x and x fits in a uint32, where x is the variable
// used in the pure Go decode_other.go code.
// length = int(x) + 1
//
// Unlike the pure Go code, we don't need to check if length <= 0 because
// R_LEN can hold 64 bits, so the increment cannot overflow.
INCQ R_LEN
// Prepare to check if copying length bytes will run past the end of dst or
// src.
//
// R_TMP0 = len(dst) - d
// R_TMP1 = len(src) - s
MOVQ R_DEND, R_TMP0
SUBQ R_DST, R_TMP0
MOVQ R_SEND, R_TMP1
SUBQ R_SRC, R_TMP1
// !!! Try a faster technique for short (16 or fewer bytes) copies.
//
// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
// goto callMemmove // Fall back on calling runtime·memmove.
// }
//
// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
// against 21 instead of 16, because it cannot assume that all of its input
// is contiguous in memory and so it needs to leave enough source bytes to
// read the next tag without refilling buffers, but Go's Decode assumes
// contiguousness (the src argument is a []byte).
CMPQ R_LEN, $16
JGT callMemmove
CMPQ R_TMP0, $16
JLT callMemmove
CMPQ R_TMP1, $16
JLT callMemmove
// !!! Implement the copy from src to dst as a 16-byte load and store.
// (Decode's documentation says that dst and src must not overlap.)
//
// This always copies 16 bytes, instead of only length bytes, but that's
// OK. If the input is a valid Snappy encoding then subsequent iterations
// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
// non-nil error), so the overrun will be ignored.
//
// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
// 16-byte loads and stores. This technique probably wouldn't be as
// effective on architectures that are fussier about alignment.
MOVOU 0(R_SRC), X0
MOVOU X0, 0(R_DST)
// d += length
// s += length
ADDQ R_LEN, R_DST
ADDQ R_LEN, R_SRC
JMP loop
callMemmove:
// if length > len(dst)-d || length > len(src)-s { etc }
CMPQ R_LEN, R_TMP0
JGT errCorrupt
CMPQ R_LEN, R_TMP1
JGT errCorrupt
// copy(dst[d:], src[s:s+length])
//
// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
// R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
// three registers to the stack, to save local variables across the CALL.
MOVQ R_DST, 0(SP)
MOVQ R_SRC, 8(SP)
MOVQ R_LEN, 16(SP)
MOVQ R_DST, 24(SP)
MOVQ R_SRC, 32(SP)
MOVQ R_LEN, 40(SP)
MOVQ R_OFF, 48(SP)
CALL runtime·memmove(SB)
// Restore local variables: unspill registers from the stack and
// re-calculate R_DBASE-R_SEND.
MOVQ 24(SP), R_DST
MOVQ 32(SP), R_SRC
MOVQ 40(SP), R_LEN
MOVQ 48(SP), R_OFF
MOVQ dst_base+0(FP), R_DBASE
MOVQ dst_len+8(FP), R_DLEN
MOVQ R_DBASE, R_DEND
ADDQ R_DLEN, R_DEND
MOVQ src_base+24(FP), R_SBASE
MOVQ src_len+32(FP), R_SLEN
MOVQ R_SBASE, R_SEND
ADDQ R_SLEN, R_SEND
// d += length
// s += length
ADDQ R_LEN, R_DST
ADDQ R_LEN, R_SRC
JMP loop
tagLit60Plus:
// !!! This fragment does the
//
// s += x - 58; if uint(s) > uint(len(src)) { etc }
//
// checks. In the asm version, we code it once instead of once per switch case.
ADDQ R_LEN, R_SRC
SUBQ $58, R_SRC
CMPQ R_SRC, R_SEND
JA errCorrupt
// case x == 60:
CMPL R_LEN, $61
JEQ tagLit61
JA tagLit62Plus
// x = uint32(src[s-1])
MOVBLZX -1(R_SRC), R_LEN
JMP doLit
tagLit61:
// case x == 61:
// x = uint32(src[s-2]) | uint32(src[s-1])<<8
MOVWLZX -2(R_SRC), R_LEN
JMP doLit
tagLit62Plus:
CMPL R_LEN, $62
JA tagLit63
// case x == 62:
// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
// We read one byte, safe to read one back, since we are just reading tag.
// x = binary.LittleEndian.Uint32(src[s-1:]) >> 8
MOVL -4(R_SRC), R_LEN
SHRL $8, R_LEN
JMP doLit
tagLit63:
// case x == 63:
// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
MOVL -4(R_SRC), R_LEN
JMP doLit
// The code above handles literal tags.
// ----------------------------------------
// The code below handles copy tags.
tagCopy4:
// case tagCopy4:
// s += 5
ADDQ $5, R_SRC
// if uint(s) > uint(len(src)) { etc }
CMPQ R_SRC, R_SEND
JA errCorrupt
// length = 1 + int(src[s-5])>>2
SHRQ $2, R_LEN
INCQ R_LEN
// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
MOVLQZX -4(R_SRC), R_OFF
JMP doCopy
tagCopy2:
// case tagCopy2:
// s += 3
ADDQ $3, R_SRC
// if uint(s) > uint(len(src)) { etc }
CMPQ R_SRC, R_SEND
JA errCorrupt
// length = 1 + int(src[s-3])>>2
SHRQ $2, R_LEN
INCQ R_LEN
// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
MOVWQZX -2(R_SRC), R_OFF
JMP doCopy
tagCopy:
// We have a copy tag. We assume that:
// - R_TMP1 == src[s] & 0x03
// - R_LEN == src[s]
CMPQ R_TMP1, $2
JEQ tagCopy2
JA tagCopy4
// case tagCopy1:
// s += 2
ADDQ $2, R_SRC
// if uint(s) > uint(len(src)) { etc }
CMPQ R_SRC, R_SEND
JA errCorrupt
// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
// length = 4 + int(src[s-2])>>2&0x7
MOVBQZX -1(R_SRC), R_TMP1
MOVQ R_LEN, R_TMP0
SHRQ $2, R_LEN
ANDQ $0xe0, R_TMP0
ANDQ $7, R_LEN
SHLQ $3, R_TMP0
ADDQ $4, R_LEN
ORQ R_TMP1, R_TMP0
// check if repeat code, ZF set by ORQ.
JZ repeatCode
// This is a regular copy, transfer our temporary value to R_OFF (length)
MOVQ R_TMP0, R_OFF
JMP doCopy
// This is a repeat code.
repeatCode:
// If length < 9, reuse last offset, with the length already calculated.
CMPQ R_LEN, $9
JL doCopyRepeat
// Read additional bytes for length.
JE repeatLen1
// Rare, so the extra branch shouldn't hurt too much.
CMPQ R_LEN, $10
JE repeatLen2
JMP repeatLen3
// Read repeat lengths.
repeatLen1:
// s ++
ADDQ $1, R_SRC
// if uint(s) > uint(len(src)) { etc }
CMPQ R_SRC, R_SEND
JA errCorrupt
// length = src[s-1] + 8
MOVBQZX -1(R_SRC), R_LEN
ADDL $8, R_LEN
JMP doCopyRepeat
repeatLen2:
// s +=2
ADDQ $2, R_SRC
// if uint(s) > uint(len(src)) { etc }
CMPQ R_SRC, R_SEND
JA errCorrupt
// length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + (1 << 8)
MOVWQZX -2(R_SRC), R_LEN
ADDL $260, R_LEN
JMP doCopyRepeat
repeatLen3:
// s +=3
ADDQ $3, R_SRC
// if uint(s) > uint(len(src)) { etc }
CMPQ R_SRC, R_SEND
JA errCorrupt
// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + (1 << 16)
// Read one byte further back (just part of the tag, shifted out)
MOVL -4(R_SRC), R_LEN
SHRL $8, R_LEN
ADDL $65540, R_LEN
JMP doCopyRepeat
doCopy:
// This is the end of the outer "switch", when we have a copy tag.
//
// We assume that:
// - R_LEN == length && R_LEN > 0
// - R_OFF == offset
// if d < offset { etc }
MOVQ R_DST, R_TMP1
SUBQ R_DBASE, R_TMP1
CMPQ R_TMP1, R_OFF
JLT errCorrupt
// Repeat values can skip the test above, since any offset > 0 will be in dst.
doCopyRepeat:
// if offset <= 0 { etc }
CMPQ R_OFF, $0
JLE errCorrupt
// if length > len(dst)-d { etc }
MOVQ R_DEND, R_TMP1
SUBQ R_DST, R_TMP1
CMPQ R_LEN, R_TMP1
JGT errCorrupt
// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
//
// Set:
// - R_TMP2 = len(dst)-d
// - R_TMP3 = &dst[d-offset]
MOVQ R_DEND, R_TMP2
SUBQ R_DST, R_TMP2
MOVQ R_DST, R_TMP3
SUBQ R_OFF, R_TMP3
// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
//
// First, try using two 8-byte load/stores, similar to the doLit technique
// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
// and not one 16-byte load/store, and the first store has to be before the
// second load, due to the overlap if offset is in the range [8, 16).
//
// if length > 16 || offset < 8 || len(dst)-d < 16 {
// goto slowForwardCopy
// }
// copy 16 bytes
// d += length
CMPQ R_LEN, $16
JGT slowForwardCopy
CMPQ R_OFF, $8
JLT slowForwardCopy
CMPQ R_TMP2, $16
JLT slowForwardCopy
MOVQ 0(R_TMP3), R_TMP0
MOVQ R_TMP0, 0(R_DST)
MOVQ 8(R_TMP3), R_TMP1
MOVQ R_TMP1, 8(R_DST)
ADDQ R_LEN, R_DST
JMP loop
slowForwardCopy:
// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
// can still try 8-byte load stores, provided we can overrun up to 10 extra
// bytes. As above, the overrun will be fixed up by subsequent iterations
// of the outermost loop.
//
// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
// commentary says:
//
// ----
//
// The main part of this loop is a simple copy of eight bytes at a time
// until we've copied (at least) the requested amount of bytes. However,
// if d and d-offset are less than eight bytes apart (indicating a
// repeating pattern of length < 8), we first need to expand the pattern in
// order to get the correct results. For instance, if the buffer looks like
// this, with the eight-byte <d-offset> and <d> patterns marked as
// intervals:
//
// abxxxxxxxxxxxx
// [------] d-offset
// [------] d
//
// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
// once, after which we can move <d> two bytes without moving <d-offset>:
//
// ababxxxxxxxxxx
// [------] d-offset
// [------] d
//
// and repeat the exercise until the two no longer overlap.
//
// This allows us to do very well in the special case of one single byte
// repeated many times, without taking a big hit for more general cases.
//
// The worst case of extra writing past the end of the match occurs when
// offset == 1 and length == 1; the last copy will read from byte positions
// [0..7] and write to [4..11], whereas it was only supposed to write to
// position 1. Thus, ten excess bytes.
//
// ----
//
// That "10 byte overrun" worst case is confirmed by Go's
// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
// and finishSlowForwardCopy algorithm.
//
// if length > len(dst)-d-10 {
// goto verySlowForwardCopy
// }
SUBQ $10, R_TMP2
CMPQ R_LEN, R_TMP2
JGT verySlowForwardCopy
// We want to keep the offset, so we use R_TMP2 from here.
MOVQ R_OFF, R_TMP2
makeOffsetAtLeast8:
// !!! As above, expand the pattern so that offset >= 8 and we can use
// 8-byte load/stores.
//
// for offset < 8 {
// copy 8 bytes from dst[d-offset:] to dst[d:]
// length -= offset
// d += offset
// offset += offset
// // The two previous lines together means that d-offset, and therefore
// // R_TMP3, is unchanged.
// }
CMPQ R_TMP2, $8
JGE fixUpSlowForwardCopy
MOVQ (R_TMP3), R_TMP1
MOVQ R_TMP1, (R_DST)
SUBQ R_TMP2, R_LEN
ADDQ R_TMP2, R_DST
ADDQ R_TMP2, R_TMP2
JMP makeOffsetAtLeast8
fixUpSlowForwardCopy:
// !!! Add length (which might be negative now) to d (implied by R_DST being
// &dst[d]) so that d ends up at the right place when we jump back to the
// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
// length is positive, copying the remaining length bytes will write to the
// right place.
MOVQ R_DST, R_TMP0
ADDQ R_LEN, R_DST
finishSlowForwardCopy:
// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
// length means that we overrun, but as above, that will be fixed up by
// subsequent iterations of the outermost loop.
CMPQ R_LEN, $0
JLE loop
MOVQ (R_TMP3), R_TMP1
MOVQ R_TMP1, (R_TMP0)
ADDQ $8, R_TMP3
ADDQ $8, R_TMP0
SUBQ $8, R_LEN
JMP finishSlowForwardCopy
verySlowForwardCopy:
// verySlowForwardCopy is a simple implementation of forward copy. In C
// parlance, this is a do/while loop instead of a while loop, since we know
// that length > 0. In Go syntax:
//
// for {
// dst[d] = dst[d - offset]
// d++
// length--
// if length == 0 {
// break
// }
// }
MOVB (R_TMP3), R_TMP1
MOVB R_TMP1, (R_DST)
INCQ R_TMP3
INCQ R_DST
DECQ R_LEN
JNZ verySlowForwardCopy
JMP loop
// The code above handles copy tags.
// ----------------------------------------
end:
// This is the end of the "for s < len(src)".
//
// if d != len(dst) { etc }
CMPQ R_DST, R_DEND
JNE errCorrupt
// return 0
MOVQ $0, ret+48(FP)
RET
errCorrupt:
// return decodeErrCodeCorrupt
MOVQ $1, ret+48(FP)
RET

574
vendor/github.com/klauspost/compress/s2/decode_arm64.s generated vendored Normal file
View file

@ -0,0 +1,574 @@
// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !appengine
// +build gc
// +build !noasm
#include "textflag.h"
#define R_TMP0 R2
#define R_TMP1 R3
#define R_LEN R4
#define R_OFF R5
#define R_SRC R6
#define R_DST R7
#define R_DBASE R8
#define R_DLEN R9
#define R_DEND R10
#define R_SBASE R11
#define R_SLEN R12
#define R_SEND R13
#define R_TMP2 R14
#define R_TMP3 R15
// TEST_SRC will check if R_SRC is <= SRC_END
#define TEST_SRC() \
CMP R_SEND, R_SRC \
BGT errCorrupt
// MOVD R_SRC, R_TMP1
// SUB R_SBASE, R_TMP1, R_TMP1
// CMP R_SLEN, R_TMP1
// BGT errCorrupt
// The asm code generally follows the pure Go code in decode_other.go, except
// where marked with a "!!!".
// func decode(dst, src []byte) int
//
// All local variables fit into registers. The non-zero stack size is only to
// spill registers and push args when issuing a CALL. The register allocation:
// - R_TMP0 scratch
// - R_TMP1 scratch
// - R_LEN length or x
// - R_OFF offset
// - R_SRC &src[s]
// - R_DST &dst[d]
// + R_DBASE dst_base
// + R_DLEN dst_len
// + R_DEND dst_base + dst_len
// + R_SBASE src_base
// + R_SLEN src_len
// + R_SEND src_base + src_len
// - R_TMP2 used by doCopy
// - R_TMP3 used by doCopy
//
// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
// function, and after a CALL returns, and are not otherwise modified.
//
// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
TEXT ·s2Decode(SB), NOSPLIT, $56-64
// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
MOVD dst_base+0(FP), R_DBASE
MOVD dst_len+8(FP), R_DLEN
MOVD R_DBASE, R_DST
MOVD R_DBASE, R_DEND
ADD R_DLEN, R_DEND, R_DEND
MOVD src_base+24(FP), R_SBASE
MOVD src_len+32(FP), R_SLEN
MOVD R_SBASE, R_SRC
MOVD R_SBASE, R_SEND
ADD R_SLEN, R_SEND, R_SEND
MOVD $0, R_OFF
loop:
// for s < len(src)
CMP R_SEND, R_SRC
BEQ end
// R_LEN = uint32(src[s])
//
// switch src[s] & 0x03
MOVBU (R_SRC), R_LEN
MOVW R_LEN, R_TMP1
ANDW $3, R_TMP1
MOVW $1, R1
CMPW R1, R_TMP1
BGE tagCopy
// ----------------------------------------
// The code below handles literal tags.
// case tagLiteral:
// x := uint32(src[s] >> 2)
// switch
MOVW $60, R1
LSRW $2, R_LEN, R_LEN
CMPW R_LEN, R1
BLS tagLit60Plus
// case x < 60:
// s++
ADD $1, R_SRC, R_SRC
doLit:
// This is the end of the inner "switch", when we have a literal tag.
//
// We assume that R_LEN == x and x fits in a uint32, where x is the variable
// used in the pure Go decode_other.go code.
// length = int(x) + 1
//
// Unlike the pure Go code, we don't need to check if length <= 0 because
// R_LEN can hold 64 bits, so the increment cannot overflow.
ADD $1, R_LEN, R_LEN
// Prepare to check if copying length bytes will run past the end of dst or
// src.
//
// R_TMP0 = len(dst) - d
// R_TMP1 = len(src) - s
MOVD R_DEND, R_TMP0
SUB R_DST, R_TMP0, R_TMP0
MOVD R_SEND, R_TMP1
SUB R_SRC, R_TMP1, R_TMP1
// !!! Try a faster technique for short (16 or fewer bytes) copies.
//
// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
// goto callMemmove // Fall back on calling runtime·memmove.
// }
//
// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
// against 21 instead of 16, because it cannot assume that all of its input
// is contiguous in memory and so it needs to leave enough source bytes to
// read the next tag without refilling buffers, but Go's Decode assumes
// contiguousness (the src argument is a []byte).
CMP $16, R_LEN
BGT callMemmove
CMP $16, R_TMP0
BLT callMemmove
CMP $16, R_TMP1
BLT callMemmove
// !!! Implement the copy from src to dst as a 16-byte load and store.
// (Decode's documentation says that dst and src must not overlap.)
//
// This always copies 16 bytes, instead of only length bytes, but that's
// OK. If the input is a valid Snappy encoding then subsequent iterations
// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
// non-nil error), so the overrun will be ignored.
//
// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
// 16-byte loads and stores. This technique probably wouldn't be as
// effective on architectures that are fussier about alignment.
LDP 0(R_SRC), (R_TMP2, R_TMP3)
STP (R_TMP2, R_TMP3), 0(R_DST)
// d += length
// s += length
ADD R_LEN, R_DST, R_DST
ADD R_LEN, R_SRC, R_SRC
B loop
callMemmove:
// if length > len(dst)-d || length > len(src)-s { etc }
CMP R_TMP0, R_LEN
BGT errCorrupt
CMP R_TMP1, R_LEN
BGT errCorrupt
// copy(dst[d:], src[s:s+length])
//
// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
// R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
// three registers to the stack, to save local variables across the CALL.
MOVD R_DST, 8(RSP)
MOVD R_SRC, 16(RSP)
MOVD R_LEN, 24(RSP)
MOVD R_DST, 32(RSP)
MOVD R_SRC, 40(RSP)
MOVD R_LEN, 48(RSP)
MOVD R_OFF, 56(RSP)
CALL runtime·memmove(SB)
// Restore local variables: unspill registers from the stack and
// re-calculate R_DBASE-R_SEND.
MOVD 32(RSP), R_DST
MOVD 40(RSP), R_SRC
MOVD 48(RSP), R_LEN
MOVD 56(RSP), R_OFF
MOVD dst_base+0(FP), R_DBASE
MOVD dst_len+8(FP), R_DLEN
MOVD R_DBASE, R_DEND
ADD R_DLEN, R_DEND, R_DEND
MOVD src_base+24(FP), R_SBASE
MOVD src_len+32(FP), R_SLEN
MOVD R_SBASE, R_SEND
ADD R_SLEN, R_SEND, R_SEND
// d += length
// s += length
ADD R_LEN, R_DST, R_DST
ADD R_LEN, R_SRC, R_SRC
B loop
tagLit60Plus:
// !!! This fragment does the
//
// s += x - 58; if uint(s) > uint(len(src)) { etc }
//
// checks. In the asm version, we code it once instead of once per switch case.
ADD R_LEN, R_SRC, R_SRC
SUB $58, R_SRC, R_SRC
TEST_SRC()
// case x == 60:
MOVW $61, R1
CMPW R1, R_LEN
BEQ tagLit61
BGT tagLit62Plus
// x = uint32(src[s-1])
MOVBU -1(R_SRC), R_LEN
B doLit
tagLit61:
// case x == 61:
// x = uint32(src[s-2]) | uint32(src[s-1])<<8
MOVHU -2(R_SRC), R_LEN
B doLit
tagLit62Plus:
CMPW $62, R_LEN
BHI tagLit63
// case x == 62:
// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
MOVHU -3(R_SRC), R_LEN
MOVBU -1(R_SRC), R_TMP1
ORR R_TMP1<<16, R_LEN
B doLit
tagLit63:
// case x == 63:
// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
MOVWU -4(R_SRC), R_LEN
B doLit
// The code above handles literal tags.
// ----------------------------------------
// The code below handles copy tags.
tagCopy4:
// case tagCopy4:
// s += 5
ADD $5, R_SRC, R_SRC
// if uint(s) > uint(len(src)) { etc }
MOVD R_SRC, R_TMP1
SUB R_SBASE, R_TMP1, R_TMP1
CMP R_SLEN, R_TMP1
BGT errCorrupt
// length = 1 + int(src[s-5])>>2
MOVD $1, R1
ADD R_LEN>>2, R1, R_LEN
// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
MOVWU -4(R_SRC), R_OFF
B doCopy
tagCopy2:
// case tagCopy2:
// s += 3
ADD $3, R_SRC, R_SRC
// if uint(s) > uint(len(src)) { etc }
TEST_SRC()
// length = 1 + int(src[s-3])>>2
MOVD $1, R1
ADD R_LEN>>2, R1, R_LEN
// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
MOVHU -2(R_SRC), R_OFF
B doCopy
tagCopy:
// We have a copy tag. We assume that:
// - R_TMP1 == src[s] & 0x03
// - R_LEN == src[s]
CMP $2, R_TMP1
BEQ tagCopy2
BGT tagCopy4
// case tagCopy1:
// s += 2
ADD $2, R_SRC, R_SRC
// if uint(s) > uint(len(src)) { etc }
TEST_SRC()
// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
// Calculate offset in R_TMP0 in case it is a repeat.
MOVD R_LEN, R_TMP0
AND $0xe0, R_TMP0
MOVBU -1(R_SRC), R_TMP1
ORR R_TMP0<<3, R_TMP1, R_TMP0
// length = 4 + int(src[s-2])>>2&0x7
MOVD $7, R1
AND R_LEN>>2, R1, R_LEN
ADD $4, R_LEN, R_LEN
// check if repeat code with offset 0.
CMP $0, R_TMP0
BEQ repeatCode
// This is a regular copy, transfer our temporary value to R_OFF (offset)
MOVD R_TMP0, R_OFF
B doCopy
// This is a repeat code.
repeatCode:
// If length < 9, reuse last offset, with the length already calculated.
CMP $9, R_LEN
BLT doCopyRepeat
BEQ repeatLen1
CMP $10, R_LEN
BEQ repeatLen2
repeatLen3:
// s +=3
ADD $3, R_SRC, R_SRC
// if uint(s) > uint(len(src)) { etc }
TEST_SRC()
// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540
MOVBU -1(R_SRC), R_TMP0
MOVHU -3(R_SRC), R_LEN
ORR R_TMP0<<16, R_LEN, R_LEN
ADD $65540, R_LEN, R_LEN
B doCopyRepeat
repeatLen2:
// s +=2
ADD $2, R_SRC, R_SRC
// if uint(s) > uint(len(src)) { etc }
TEST_SRC()
// length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260
MOVHU -2(R_SRC), R_LEN
ADD $260, R_LEN, R_LEN
B doCopyRepeat
repeatLen1:
// s +=1
ADD $1, R_SRC, R_SRC
// if uint(s) > uint(len(src)) { etc }
TEST_SRC()
// length = src[s-1] + 8
MOVBU -1(R_SRC), R_LEN
ADD $8, R_LEN, R_LEN
B doCopyRepeat
doCopy:
// This is the end of the outer "switch", when we have a copy tag.
//
// We assume that:
// - R_LEN == length && R_LEN > 0
// - R_OFF == offset
// if d < offset { etc }
MOVD R_DST, R_TMP1
SUB R_DBASE, R_TMP1, R_TMP1
CMP R_OFF, R_TMP1
BLT errCorrupt
// Repeat values can skip the test above, since any offset > 0 will be in dst.
doCopyRepeat:
// if offset <= 0 { etc }
CMP $0, R_OFF
BLE errCorrupt
// if length > len(dst)-d { etc }
MOVD R_DEND, R_TMP1
SUB R_DST, R_TMP1, R_TMP1
CMP R_TMP1, R_LEN
BGT errCorrupt
// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
//
// Set:
// - R_TMP2 = len(dst)-d
// - R_TMP3 = &dst[d-offset]
MOVD R_DEND, R_TMP2
SUB R_DST, R_TMP2, R_TMP2
MOVD R_DST, R_TMP3
SUB R_OFF, R_TMP3, R_TMP3
// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
//
// First, try using two 8-byte load/stores, similar to the doLit technique
// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
// and not one 16-byte load/store, and the first store has to be before the
// second load, due to the overlap if offset is in the range [8, 16).
//
// if length > 16 || offset < 8 || len(dst)-d < 16 {
// goto slowForwardCopy
// }
// copy 16 bytes
// d += length
CMP $16, R_LEN
BGT slowForwardCopy
CMP $8, R_OFF
BLT slowForwardCopy
CMP $16, R_TMP2
BLT slowForwardCopy
MOVD 0(R_TMP3), R_TMP0
MOVD R_TMP0, 0(R_DST)
MOVD 8(R_TMP3), R_TMP1
MOVD R_TMP1, 8(R_DST)
ADD R_LEN, R_DST, R_DST
B loop
slowForwardCopy:
// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
// can still try 8-byte load stores, provided we can overrun up to 10 extra
// bytes. As above, the overrun will be fixed up by subsequent iterations
// of the outermost loop.
//
// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
// commentary says:
//
// ----
//
// The main part of this loop is a simple copy of eight bytes at a time
// until we've copied (at least) the requested amount of bytes. However,
// if d and d-offset are less than eight bytes apart (indicating a
// repeating pattern of length < 8), we first need to expand the pattern in
// order to get the correct results. For instance, if the buffer looks like
// this, with the eight-byte <d-offset> and <d> patterns marked as
// intervals:
//
// abxxxxxxxxxxxx
// [------] d-offset
// [------] d
//
// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
// once, after which we can move <d> two bytes without moving <d-offset>:
//
// ababxxxxxxxxxx
// [------] d-offset
// [------] d
//
// and repeat the exercise until the two no longer overlap.
//
// This allows us to do very well in the special case of one single byte
// repeated many times, without taking a big hit for more general cases.
//
// The worst case of extra writing past the end of the match occurs when
// offset == 1 and length == 1; the last copy will read from byte positions
// [0..7] and write to [4..11], whereas it was only supposed to write to
// position 1. Thus, ten excess bytes.
//
// ----
//
// That "10 byte overrun" worst case is confirmed by Go's
// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
// and finishSlowForwardCopy algorithm.
//
// if length > len(dst)-d-10 {
// goto verySlowForwardCopy
// }
SUB $10, R_TMP2, R_TMP2
CMP R_TMP2, R_LEN
BGT verySlowForwardCopy
// We want to keep the offset, so we use R_TMP2 from here.
MOVD R_OFF, R_TMP2
makeOffsetAtLeast8:
// !!! As above, expand the pattern so that offset >= 8 and we can use
// 8-byte load/stores.
//
// for offset < 8 {
// copy 8 bytes from dst[d-offset:] to dst[d:]
// length -= offset
// d += offset
// offset += offset
// // The two previous lines together means that d-offset, and therefore
// // R_TMP3, is unchanged.
// }
CMP $8, R_TMP2
BGE fixUpSlowForwardCopy
MOVD (R_TMP3), R_TMP1
MOVD R_TMP1, (R_DST)
SUB R_TMP2, R_LEN, R_LEN
ADD R_TMP2, R_DST, R_DST
ADD R_TMP2, R_TMP2, R_TMP2
B makeOffsetAtLeast8
fixUpSlowForwardCopy:
// !!! Add length (which might be negative now) to d (implied by R_DST being
// &dst[d]) so that d ends up at the right place when we jump back to the
// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
// length is positive, copying the remaining length bytes will write to the
// right place.
MOVD R_DST, R_TMP0
ADD R_LEN, R_DST, R_DST
finishSlowForwardCopy:
// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
// length means that we overrun, but as above, that will be fixed up by
// subsequent iterations of the outermost loop.
MOVD $0, R1
CMP R1, R_LEN
BLE loop
MOVD (R_TMP3), R_TMP1
MOVD R_TMP1, (R_TMP0)
ADD $8, R_TMP3, R_TMP3
ADD $8, R_TMP0, R_TMP0
SUB $8, R_LEN, R_LEN
B finishSlowForwardCopy
verySlowForwardCopy:
// verySlowForwardCopy is a simple implementation of forward copy. In C
// parlance, this is a do/while loop instead of a while loop, since we know
// that length > 0. In Go syntax:
//
// for {
// dst[d] = dst[d - offset]
// d++
// length--
// if length == 0 {
// break
// }
// }
MOVB (R_TMP3), R_TMP1
MOVB R_TMP1, (R_DST)
ADD $1, R_TMP3, R_TMP3
ADD $1, R_DST, R_DST
SUB $1, R_LEN, R_LEN
CBNZ R_LEN, verySlowForwardCopy
B loop
// The code above handles copy tags.
// ----------------------------------------
end:
// This is the end of the "for s < len(src)".
//
// if d != len(dst) { etc }
CMP R_DEND, R_DST
BNE errCorrupt
// return 0
MOVD $0, ret+48(FP)
RET
errCorrupt:
// return decodeErrCodeCorrupt
MOVD $1, R_TMP0
MOVD R_TMP0, ret+48(FP)
RET

17
vendor/github.com/klauspost/compress/s2/decode_asm.go generated vendored Normal file
View file

@ -0,0 +1,17 @@
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
// Copyright (c) 2019 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build (amd64 || arm64) && !appengine && gc && !noasm
// +build amd64 arm64
// +build !appengine
// +build gc
// +build !noasm
package s2
// decode has the same semantics as in decode_other.go.
//
//go:noescape
func s2Decode(dst, src []byte) int

267
vendor/github.com/klauspost/compress/s2/decode_other.go generated vendored Normal file
View file

@ -0,0 +1,267 @@
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
// Copyright (c) 2019 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build (!amd64 && !arm64) || appengine || !gc || noasm
// +build !amd64,!arm64 appengine !gc noasm
package s2
import (
"fmt"
"strconv"
)
// decode writes the decoding of src to dst. It assumes that the varint-encoded
// length of the decompressed bytes has already been read, and that len(dst)
// equals that length.
//
// It returns 0 on success or a decodeErrCodeXxx error code on failure.
func s2Decode(dst, src []byte) int {
const debug = false
if debug {
fmt.Println("Starting decode, dst len:", len(dst))
}
var d, s, length int
offset := 0
// As long as we can read at least 5 bytes...
for s < len(src)-5 {
switch src[s] & 0x03 {
case tagLiteral:
x := uint32(src[s] >> 2)
switch {
case x < 60:
s++
case x == 60:
s += 2
x = uint32(src[s-1])
case x == 61:
s += 3
x = uint32(src[s-2]) | uint32(src[s-1])<<8
case x == 62:
s += 4
x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
case x == 63:
s += 5
x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
}
length = int(x) + 1
if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) {
return decodeErrCodeCorrupt
}
if debug {
fmt.Println("literals, length:", length, "d-after:", d+length)
}
copy(dst[d:], src[s:s+length])
d += length
s += length
continue
case tagCopy1:
s += 2
length = int(src[s-2]) >> 2 & 0x7
toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
if toffset == 0 {
if debug {
fmt.Print("(repeat) ")
}
// keep last offset
switch length {
case 5:
s += 1
length = int(uint32(src[s-1])) + 4
case 6:
s += 2
length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8)
case 7:
s += 3
length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16)
default: // 0-> 4
}
} else {
offset = toffset
}
length += 4
case tagCopy2:
s += 3
length = 1 + int(src[s-3])>>2
offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
case tagCopy4:
s += 5
length = 1 + int(src[s-5])>>2
offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
}
if offset <= 0 || d < offset || length > len(dst)-d {
return decodeErrCodeCorrupt
}
if debug {
fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length)
}
// Copy from an earlier sub-slice of dst to a later sub-slice.
// If no overlap, use the built-in copy:
if offset > length {
copy(dst[d:d+length], dst[d-offset:])
d += length
continue
}
// Unlike the built-in copy function, this byte-by-byte copy always runs
// forwards, even if the slices overlap. Conceptually, this is:
//
// d += forwardCopy(dst[d:d+length], dst[d-offset:])
//
// We align the slices into a and b and show the compiler they are the same size.
// This allows the loop to run without bounds checks.
a := dst[d : d+length]
b := dst[d-offset:]
b = b[:len(a)]
for i := range a {
a[i] = b[i]
}
d += length
}
// Remaining with extra checks...
for s < len(src) {
switch src[s] & 0x03 {
case tagLiteral:
x := uint32(src[s] >> 2)
switch {
case x < 60:
s++
case x == 60:
s += 2
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
x = uint32(src[s-1])
case x == 61:
s += 3
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
x = uint32(src[s-2]) | uint32(src[s-1])<<8
case x == 62:
s += 4
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
case x == 63:
s += 5
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
}
length = int(x) + 1
if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) {
return decodeErrCodeCorrupt
}
if debug {
fmt.Println("literals, length:", length, "d-after:", d+length)
}
copy(dst[d:], src[s:s+length])
d += length
s += length
continue
case tagCopy1:
s += 2
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
length = int(src[s-2]) >> 2 & 0x7
toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
if toffset == 0 {
if debug {
fmt.Print("(repeat) ")
}
// keep last offset
switch length {
case 5:
s += 1
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
length = int(uint32(src[s-1])) + 4
case 6:
s += 2
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8)
case 7:
s += 3
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16)
default: // 0-> 4
}
} else {
offset = toffset
}
length += 4
case tagCopy2:
s += 3
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
length = 1 + int(src[s-3])>>2
offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
case tagCopy4:
s += 5
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
length = 1 + int(src[s-5])>>2
offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
}
if offset <= 0 || d < offset || length > len(dst)-d {
return decodeErrCodeCorrupt
}
if debug {
fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length)
}
// Copy from an earlier sub-slice of dst to a later sub-slice.
// If no overlap, use the built-in copy:
if offset > length {
copy(dst[d:d+length], dst[d-offset:])
d += length
continue
}
// Unlike the built-in copy function, this byte-by-byte copy always runs
// forwards, even if the slices overlap. Conceptually, this is:
//
// d += forwardCopy(dst[d:d+length], dst[d-offset:])
//
// We align the slices into a and b and show the compiler they are the same size.
// This allows the loop to run without bounds checks.
a := dst[d : d+length]
b := dst[d-offset:]
b = b[:len(a)]
for i := range a {
a[i] = b[i]
}
d += length
}
if d != len(dst) {
return decodeErrCodeCorrupt
}
return 0
}

1341
vendor/github.com/klauspost/compress/s2/encode.go generated vendored Normal file

File diff suppressed because it is too large Load diff

456
vendor/github.com/klauspost/compress/s2/encode_all.go generated vendored Normal file
View file

@ -0,0 +1,456 @@
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
// Copyright (c) 2019 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package s2
import (
"bytes"
"encoding/binary"
"math/bits"
)
func load32(b []byte, i int) uint32 {
return binary.LittleEndian.Uint32(b[i:])
}
func load64(b []byte, i int) uint64 {
return binary.LittleEndian.Uint64(b[i:])
}
// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <64.
func hash6(u uint64, h uint8) uint32 {
const prime6bytes = 227718039650203
return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63))
}
func encodeGo(dst, src []byte) []byte {
if n := MaxEncodedLen(len(src)); n < 0 {
panic(ErrTooLarge)
} else if len(dst) < n {
dst = make([]byte, n)
}
// The block starts with the varint-encoded length of the decompressed bytes.
d := binary.PutUvarint(dst, uint64(len(src)))
if len(src) == 0 {
return dst[:d]
}
if len(src) < minNonLiteralBlockSize {
d += emitLiteral(dst[d:], src)
return dst[:d]
}
n := encodeBlockGo(dst[d:], src)
if n > 0 {
d += n
return dst[:d]
}
// Not compressible
d += emitLiteral(dst[d:], src)
return dst[:d]
}
// encodeBlockGo encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
// len(dst) >= MaxEncodedLen(len(src)) &&
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockGo(dst, src []byte) (d int) {
// Initialize the hash table.
const (
tableBits = 14
maxTableSize = 1 << tableBits
debug = false
)
var table [maxTableSize]uint32
// sLimit is when to stop looking for offset/length copies. The inputMargin
// lets us use a fast path for emitLiteral in the main loop, while we are
// looking for copies.
sLimit := len(src) - inputMargin
// Bail if we can't compress to at least this.
dstLimit := len(src) - len(src)>>5 - 5
// nextEmit is where in src the next emitLiteral should start from.
nextEmit := 0
// The encoded form must start with a literal, as there are no previous
// bytes to copy, so we start looking for hash matches at s == 1.
s := 1
cv := load64(src, s)
// We search for a repeat at -1, but don't output repeats when nextEmit == 0
repeat := 1
for {
candidate := 0
for {
// Next src position to check
nextS := s + (s-nextEmit)>>6 + 4
if nextS > sLimit {
goto emitRemainder
}
hash0 := hash6(cv, tableBits)
hash1 := hash6(cv>>8, tableBits)
candidate = int(table[hash0])
candidate2 := int(table[hash1])
table[hash0] = uint32(s)
table[hash1] = uint32(s + 1)
hash2 := hash6(cv>>16, tableBits)
// Check repeat at offset checkRep.
const checkRep = 1
if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
base := s + checkRep
// Extend back
for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
i--
base--
}
d += emitLiteral(dst[d:], src[nextEmit:base])
// Extend forward
candidate := s - repeat + 4 + checkRep
s += 4 + checkRep
for s <= sLimit {
if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
s += bits.TrailingZeros64(diff) >> 3
break
}
s += 8
candidate += 8
}
if debug {
// Validate match.
if s <= candidate {
panic("s <= candidate")
}
a := src[base:s]
b := src[base-repeat : base-repeat+(s-base)]
if !bytes.Equal(a, b) {
panic("mismatch")
}
}
if nextEmit > 0 {
// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
d += emitRepeat(dst[d:], repeat, s-base)
} else {
// First match, cannot be repeat.
d += emitCopy(dst[d:], repeat, s-base)
}
nextEmit = s
if s >= sLimit {
goto emitRemainder
}
cv = load64(src, s)
continue
}
if uint32(cv) == load32(src, candidate) {
break
}
candidate = int(table[hash2])
if uint32(cv>>8) == load32(src, candidate2) {
table[hash2] = uint32(s + 2)
candidate = candidate2
s++
break
}
table[hash2] = uint32(s + 2)
if uint32(cv>>16) == load32(src, candidate) {
s += 2
break
}
cv = load64(src, nextS)
s = nextS
}
// Extend backwards.
// The top bytes will be rechecked to get the full match.
for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] {
candidate--
s--
}
// Bail if we exceed the maximum size.
if d+(s-nextEmit) > dstLimit {
return 0
}
// A 4-byte match has been found. We'll later see if more than 4 bytes
// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
// them as literal bytes.
d += emitLiteral(dst[d:], src[nextEmit:s])
// Call emitCopy, and then see if another emitCopy could be our next
// move. Repeat until we find no match for the input immediately after
// what was consumed by the last emitCopy call.
//
// If we exit this loop normally then we need to call emitLiteral next,
// though we don't yet know how big the literal will be. We handle that
// by proceeding to the next iteration of the main loop. We also can
// exit this loop via goto if we get close to exhausting the input.
for {
// Invariant: we have a 4-byte match at s, and no need to emit any
// literal bytes prior to s.
base := s
repeat = base - candidate
// Extend the 4-byte match as long as possible.
s += 4
candidate += 4
for s <= len(src)-8 {
if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
s += bits.TrailingZeros64(diff) >> 3
break
}
s += 8
candidate += 8
}
d += emitCopy(dst[d:], repeat, s-base)
if debug {
// Validate match.
if s <= candidate {
panic("s <= candidate")
}
a := src[base:s]
b := src[base-repeat : base-repeat+(s-base)]
if !bytes.Equal(a, b) {
panic("mismatch")
}
}
nextEmit = s
if s >= sLimit {
goto emitRemainder
}
if d > dstLimit {
// Do we have space for more, if not bail.
return 0
}
// Check for an immediate match, otherwise start search at s+1
x := load64(src, s-2)
m2Hash := hash6(x, tableBits)
currHash := hash6(x>>16, tableBits)
candidate = int(table[currHash])
table[m2Hash] = uint32(s - 2)
table[currHash] = uint32(s)
if debug && s == candidate {
panic("s == candidate")
}
if uint32(x>>16) != load32(src, candidate) {
cv = load64(src, s+1)
s++
break
}
}
}
emitRemainder:
if nextEmit < len(src) {
// Bail if we exceed the maximum size.
if d+len(src)-nextEmit > dstLimit {
return 0
}
d += emitLiteral(dst[d:], src[nextEmit:])
}
return d
}
func encodeBlockSnappyGo(dst, src []byte) (d int) {
// Initialize the hash table.
const (
tableBits = 14
maxTableSize = 1 << tableBits
)
var table [maxTableSize]uint32
// sLimit is when to stop looking for offset/length copies. The inputMargin
// lets us use a fast path for emitLiteral in the main loop, while we are
// looking for copies.
sLimit := len(src) - inputMargin
// Bail if we can't compress to at least this.
dstLimit := len(src) - len(src)>>5 - 5
// nextEmit is where in src the next emitLiteral should start from.
nextEmit := 0
// The encoded form must start with a literal, as there are no previous
// bytes to copy, so we start looking for hash matches at s == 1.
s := 1
cv := load64(src, s)
// We search for a repeat at -1, but don't output repeats when nextEmit == 0
repeat := 1
for {
candidate := 0
for {
// Next src position to check
nextS := s + (s-nextEmit)>>6 + 4
if nextS > sLimit {
goto emitRemainder
}
hash0 := hash6(cv, tableBits)
hash1 := hash6(cv>>8, tableBits)
candidate = int(table[hash0])
candidate2 := int(table[hash1])
table[hash0] = uint32(s)
table[hash1] = uint32(s + 1)
hash2 := hash6(cv>>16, tableBits)
// Check repeat at offset checkRep.
const checkRep = 1
if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
base := s + checkRep
// Extend back
for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
i--
base--
}
d += emitLiteral(dst[d:], src[nextEmit:base])
// Extend forward
candidate := s - repeat + 4 + checkRep
s += 4 + checkRep
for s <= sLimit {
if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
s += bits.TrailingZeros64(diff) >> 3
break
}
s += 8
candidate += 8
}
d += emitCopyNoRepeat(dst[d:], repeat, s-base)
nextEmit = s
if s >= sLimit {
goto emitRemainder
}
cv = load64(src, s)
continue
}
if uint32(cv) == load32(src, candidate) {
break
}
candidate = int(table[hash2])
if uint32(cv>>8) == load32(src, candidate2) {
table[hash2] = uint32(s + 2)
candidate = candidate2
s++
break
}
table[hash2] = uint32(s + 2)
if uint32(cv>>16) == load32(src, candidate) {
s += 2
break
}
cv = load64(src, nextS)
s = nextS
}
// Extend backwards
for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] {
candidate--
s--
}
// Bail if we exceed the maximum size.
if d+(s-nextEmit) > dstLimit {
return 0
}
// A 4-byte match has been found. We'll later see if more than 4 bytes
// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
// them as literal bytes.
d += emitLiteral(dst[d:], src[nextEmit:s])
// Call emitCopy, and then see if another emitCopy could be our next
// move. Repeat until we find no match for the input immediately after
// what was consumed by the last emitCopy call.
//
// If we exit this loop normally then we need to call emitLiteral next,
// though we don't yet know how big the literal will be. We handle that
// by proceeding to the next iteration of the main loop. We also can
// exit this loop via goto if we get close to exhausting the input.
for {
// Invariant: we have a 4-byte match at s, and no need to emit any
// literal bytes prior to s.
base := s
repeat = base - candidate
// Extend the 4-byte match as long as possible.
s += 4
candidate += 4
for s <= len(src)-8 {
if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
s += bits.TrailingZeros64(diff) >> 3
break
}
s += 8
candidate += 8
}
d += emitCopyNoRepeat(dst[d:], repeat, s-base)
if false {
// Validate match.
a := src[base:s]
b := src[base-repeat : base-repeat+(s-base)]
if !bytes.Equal(a, b) {
panic("mismatch")
}
}
nextEmit = s
if s >= sLimit {
goto emitRemainder
}
if d > dstLimit {
// Do we have space for more, if not bail.
return 0
}
// Check for an immediate match, otherwise start search at s+1
x := load64(src, s-2)
m2Hash := hash6(x, tableBits)
currHash := hash6(x>>16, tableBits)
candidate = int(table[currHash])
table[m2Hash] = uint32(s - 2)
table[currHash] = uint32(s)
if uint32(x>>16) != load32(src, candidate) {
cv = load64(src, s+1)
s++
break
}
}
}
emitRemainder:
if nextEmit < len(src) {
// Bail if we exceed the maximum size.
if d+len(src)-nextEmit > dstLimit {
return 0
}
d += emitLiteral(dst[d:], src[nextEmit:])
}
return d
}

142
vendor/github.com/klauspost/compress/s2/encode_amd64.go generated vendored Normal file
View file

@ -0,0 +1,142 @@
//go:build !appengine && !noasm && gc
// +build !appengine,!noasm,gc
package s2
// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
// len(dst) >= MaxEncodedLen(len(src)) &&
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlock(dst, src []byte) (d int) {
const (
// Use 12 bit table when less than...
limit12B = 16 << 10
// Use 10 bit table when less than...
limit10B = 4 << 10
// Use 8 bit table when less than...
limit8B = 512
)
if len(src) >= 4<<20 {
return encodeBlockAsm(dst, src)
}
if len(src) >= limit12B {
return encodeBlockAsm4MB(dst, src)
}
if len(src) >= limit10B {
return encodeBlockAsm12B(dst, src)
}
if len(src) >= limit8B {
return encodeBlockAsm10B(dst, src)
}
if len(src) < minNonLiteralBlockSize {
return 0
}
return encodeBlockAsm8B(dst, src)
}
// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
// len(dst) >= MaxEncodedLen(len(src)) &&
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetter(dst, src []byte) (d int) {
const (
// Use 12 bit table when less than...
limit12B = 16 << 10
// Use 10 bit table when less than...
limit10B = 4 << 10
// Use 8 bit table when less than...
limit8B = 512
)
if len(src) > 4<<20 {
return encodeBetterBlockAsm(dst, src)
}
if len(src) >= limit12B {
return encodeBetterBlockAsm4MB(dst, src)
}
if len(src) >= limit10B {
return encodeBetterBlockAsm12B(dst, src)
}
if len(src) >= limit8B {
return encodeBetterBlockAsm10B(dst, src)
}
if len(src) < minNonLiteralBlockSize {
return 0
}
return encodeBetterBlockAsm8B(dst, src)
}
// encodeBlockSnappy encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
// len(dst) >= MaxEncodedLen(len(src)) &&
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockSnappy(dst, src []byte) (d int) {
const (
// Use 12 bit table when less than...
limit12B = 16 << 10
// Use 10 bit table when less than...
limit10B = 4 << 10
// Use 8 bit table when less than...
limit8B = 512
)
if len(src) >= 64<<10 {
return encodeSnappyBlockAsm(dst, src)
}
if len(src) >= limit12B {
return encodeSnappyBlockAsm64K(dst, src)
}
if len(src) >= limit10B {
return encodeSnappyBlockAsm12B(dst, src)
}
if len(src) >= limit8B {
return encodeSnappyBlockAsm10B(dst, src)
}
if len(src) < minNonLiteralBlockSize {
return 0
}
return encodeSnappyBlockAsm8B(dst, src)
}
// encodeBlockSnappy encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
// len(dst) >= MaxEncodedLen(len(src)) &&
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetterSnappy(dst, src []byte) (d int) {
const (
// Use 12 bit table when less than...
limit12B = 16 << 10
// Use 10 bit table when less than...
limit10B = 4 << 10
// Use 8 bit table when less than...
limit8B = 512
)
if len(src) >= 64<<10 {
return encodeSnappyBetterBlockAsm(dst, src)
}
if len(src) >= limit12B {
return encodeSnappyBetterBlockAsm64K(dst, src)
}
if len(src) >= limit10B {
return encodeSnappyBetterBlockAsm12B(dst, src)
}
if len(src) >= limit8B {
return encodeSnappyBetterBlockAsm10B(dst, src)
}
if len(src) < minNonLiteralBlockSize {
return 0
}
return encodeSnappyBetterBlockAsm8B(dst, src)
}

630
vendor/github.com/klauspost/compress/s2/encode_best.go generated vendored Normal file
View file

@ -0,0 +1,630 @@
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
// Copyright (c) 2019 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package s2
import (
"fmt"
"math/bits"
)
// encodeBlockBest encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
// len(dst) >= MaxEncodedLen(len(src)) &&
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBest(dst, src []byte) (d int) {
// Initialize the hash tables.
const (
// Long hash matches.
lTableBits = 19
maxLTableSize = 1 << lTableBits
// Short hash matches.
sTableBits = 16
maxSTableSize = 1 << sTableBits
inputMargin = 8 + 2
)
// sLimit is when to stop looking for offset/length copies. The inputMargin
// lets us use a fast path for emitLiteral in the main loop, while we are
// looking for copies.
sLimit := len(src) - inputMargin
if len(src) < minNonLiteralBlockSize {
return 0
}
var lTable [maxLTableSize]uint64
var sTable [maxSTableSize]uint64
// Bail if we can't compress to at least this.
dstLimit := len(src) - 5
// nextEmit is where in src the next emitLiteral should start from.
nextEmit := 0
// The encoded form must start with a literal, as there are no previous
// bytes to copy, so we start looking for hash matches at s == 1.
s := 1
cv := load64(src, s)
// We search for a repeat at -1, but don't output repeats when nextEmit == 0
repeat := 1
const lowbitMask = 0xffffffff
getCur := func(x uint64) int {
return int(x & lowbitMask)
}
getPrev := func(x uint64) int {
return int(x >> 32)
}
const maxSkip = 64
for {
type match struct {
offset int
s int
length int
score int
rep bool
}
var best match
for {
// Next src position to check
nextS := (s-nextEmit)>>8 + 1
if nextS > maxSkip {
nextS = s + maxSkip
} else {
nextS += s
}
if nextS > sLimit {
goto emitRemainder
}
hashL := hash8(cv, lTableBits)
hashS := hash4(cv, sTableBits)
candidateL := lTable[hashL]
candidateS := sTable[hashS]
score := func(m match) int {
// Matches that are longer forward are penalized since we must emit it as a literal.
score := m.length - m.s
if nextEmit == m.s {
// If we do not have to emit literals, we save 1 byte
score++
}
offset := m.s - m.offset
if m.rep {
return score - emitRepeatSize(offset, m.length)
}
return score - emitCopySize(offset, m.length)
}
matchAt := func(offset, s int, first uint32, rep bool) match {
if best.length != 0 && best.s-best.offset == s-offset {
// Don't retest if we have the same offset.
return match{offset: offset, s: s}
}
if load32(src, offset) != first {
return match{offset: offset, s: s}
}
m := match{offset: offset, s: s, length: 4 + offset, rep: rep}
s += 4
for s <= sLimit {
if diff := load64(src, s) ^ load64(src, m.length); diff != 0 {
m.length += bits.TrailingZeros64(diff) >> 3
break
}
s += 8
m.length += 8
}
m.length -= offset
m.score = score(m)
if m.score <= -m.s {
// Eliminate if no savings, we might find a better one.
m.length = 0
}
return m
}
bestOf := func(a, b match) match {
if b.length == 0 {
return a
}
if a.length == 0 {
return b
}
as := a.score + b.s
bs := b.score + a.s
if as >= bs {
return a
}
return b
}
best = bestOf(matchAt(getCur(candidateL), s, uint32(cv), false), matchAt(getPrev(candidateL), s, uint32(cv), false))
best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv), false))
best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv), false))
{
best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true))
if best.length > 0 {
// s+1
nextShort := sTable[hash4(cv>>8, sTableBits)]
s := s + 1
cv := load64(src, s)
nextLong := lTable[hash8(cv, lTableBits)]
best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv), false))
best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv), false))
best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false))
best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false))
// Repeat at + 2
best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true))
// s+2
if true {
nextShort = sTable[hash4(cv>>8, sTableBits)]
s++
cv = load64(src, s)
nextLong = lTable[hash8(cv, lTableBits)]
best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv), false))
best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv), false))
best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false))
best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false))
}
// Search for a match at best match end, see if that is better.
if sAt := best.s + best.length; sAt < sLimit {
sBack := best.s
backL := best.length
// Load initial values
cv = load64(src, sBack)
// Search for mismatch
next := lTable[hash8(load64(src, sAt), lTableBits)]
//next := sTable[hash4(load64(src, sAt), sTableBits)]
if checkAt := getCur(next) - backL; checkAt > 0 {
best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
}
if checkAt := getPrev(next) - backL; checkAt > 0 {
best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
}
}
}
}
// Update table
lTable[hashL] = uint64(s) | candidateL<<32
sTable[hashS] = uint64(s) | candidateS<<32
if best.length > 0 {
break
}
cv = load64(src, nextS)
s = nextS
}
// Extend backwards, not needed for repeats...
s = best.s
if !best.rep {
for best.offset > 0 && s > nextEmit && src[best.offset-1] == src[s-1] {
best.offset--
best.length++
s--
}
}
if false && best.offset >= s {
panic(fmt.Errorf("t %d >= s %d", best.offset, s))
}
// Bail if we exceed the maximum size.
if d+(s-nextEmit) > dstLimit {
return 0
}
base := s
offset := s - best.offset
s += best.length
if offset > 65535 && s-base <= 5 && !best.rep {
// Bail if the match is equal or worse to the encoding.
s = best.s + 1
if s >= sLimit {
goto emitRemainder
}
cv = load64(src, s)
continue
}
d += emitLiteral(dst[d:], src[nextEmit:base])
if best.rep {
if nextEmit > 0 {
// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
d += emitRepeat(dst[d:], offset, best.length)
} else {
// First match, cannot be repeat.
d += emitCopy(dst[d:], offset, best.length)
}
} else {
d += emitCopy(dst[d:], offset, best.length)
}
repeat = offset
nextEmit = s
if s >= sLimit {
goto emitRemainder
}
if d > dstLimit {
// Do we have space for more, if not bail.
return 0
}
// Fill tables...
for i := best.s + 1; i < s; i++ {
cv0 := load64(src, i)
long0 := hash8(cv0, lTableBits)
short0 := hash4(cv0, sTableBits)
lTable[long0] = uint64(i) | lTable[long0]<<32
sTable[short0] = uint64(i) | sTable[short0]<<32
}
cv = load64(src, s)
}
emitRemainder:
if nextEmit < len(src) {
// Bail if we exceed the maximum size.
if d+len(src)-nextEmit > dstLimit {
return 0
}
d += emitLiteral(dst[d:], src[nextEmit:])
}
return d
}
// encodeBlockBestSnappy encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
// len(dst) >= MaxEncodedLen(len(src)) &&
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBestSnappy(dst, src []byte) (d int) {
// Initialize the hash tables.
const (
// Long hash matches.
lTableBits = 19
maxLTableSize = 1 << lTableBits
// Short hash matches.
sTableBits = 16
maxSTableSize = 1 << sTableBits
inputMargin = 8 + 2
)
// sLimit is when to stop looking for offset/length copies. The inputMargin
// lets us use a fast path for emitLiteral in the main loop, while we are
// looking for copies.
sLimit := len(src) - inputMargin
if len(src) < minNonLiteralBlockSize {
return 0
}
var lTable [maxLTableSize]uint64
var sTable [maxSTableSize]uint64
// Bail if we can't compress to at least this.
dstLimit := len(src) - 5
// nextEmit is where in src the next emitLiteral should start from.
nextEmit := 0
// The encoded form must start with a literal, as there are no previous
// bytes to copy, so we start looking for hash matches at s == 1.
s := 1
cv := load64(src, s)
// We search for a repeat at -1, but don't output repeats when nextEmit == 0
repeat := 1
const lowbitMask = 0xffffffff
getCur := func(x uint64) int {
return int(x & lowbitMask)
}
getPrev := func(x uint64) int {
return int(x >> 32)
}
const maxSkip = 64
for {
type match struct {
offset int
s int
length int
score int
}
var best match
for {
// Next src position to check
nextS := (s-nextEmit)>>8 + 1
if nextS > maxSkip {
nextS = s + maxSkip
} else {
nextS += s
}
if nextS > sLimit {
goto emitRemainder
}
hashL := hash8(cv, lTableBits)
hashS := hash4(cv, sTableBits)
candidateL := lTable[hashL]
candidateS := sTable[hashS]
score := func(m match) int {
// Matches that are longer forward are penalized since we must emit it as a literal.
score := m.length - m.s
if nextEmit == m.s {
// If we do not have to emit literals, we save 1 byte
score++
}
offset := m.s - m.offset
return score - emitCopyNoRepeatSize(offset, m.length)
}
matchAt := func(offset, s int, first uint32) match {
if best.length != 0 && best.s-best.offset == s-offset {
// Don't retest if we have the same offset.
return match{offset: offset, s: s}
}
if load32(src, offset) != first {
return match{offset: offset, s: s}
}
m := match{offset: offset, s: s, length: 4 + offset}
s += 4
for s <= sLimit {
if diff := load64(src, s) ^ load64(src, m.length); diff != 0 {
m.length += bits.TrailingZeros64(diff) >> 3
break
}
s += 8
m.length += 8
}
m.length -= offset
m.score = score(m)
if m.score <= -m.s {
// Eliminate if no savings, we might find a better one.
m.length = 0
}
return m
}
bestOf := func(a, b match) match {
if b.length == 0 {
return a
}
if a.length == 0 {
return b
}
as := a.score + b.s
bs := b.score + a.s
if as >= bs {
return a
}
return b
}
best = bestOf(matchAt(getCur(candidateL), s, uint32(cv)), matchAt(getPrev(candidateL), s, uint32(cv)))
best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv)))
best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv)))
{
best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8)))
if best.length > 0 {
// s+1
nextShort := sTable[hash4(cv>>8, sTableBits)]
s := s + 1
cv := load64(src, s)
nextLong := lTable[hash8(cv, lTableBits)]
best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv)))
best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv)))
best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv)))
best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv)))
// Repeat at + 2
best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8)))
// s+2
if true {
nextShort = sTable[hash4(cv>>8, sTableBits)]
s++
cv = load64(src, s)
nextLong = lTable[hash8(cv, lTableBits)]
best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv)))
best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv)))
best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv)))
best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv)))
}
// Search for a match at best match end, see if that is better.
if sAt := best.s + best.length; sAt < sLimit {
sBack := best.s
backL := best.length
// Load initial values
cv = load64(src, sBack)
// Search for mismatch
next := lTable[hash8(load64(src, sAt), lTableBits)]
//next := sTable[hash4(load64(src, sAt), sTableBits)]
if checkAt := getCur(next) - backL; checkAt > 0 {
best = bestOf(best, matchAt(checkAt, sBack, uint32(cv)))
}
if checkAt := getPrev(next) - backL; checkAt > 0 {
best = bestOf(best, matchAt(checkAt, sBack, uint32(cv)))
}
}
}
}
// Update table
lTable[hashL] = uint64(s) | candidateL<<32
sTable[hashS] = uint64(s) | candidateS<<32
if best.length > 0 {
break
}
cv = load64(src, nextS)
s = nextS
}
// Extend backwards, not needed for repeats...
s = best.s
if true {
for best.offset > 0 && s > nextEmit && src[best.offset-1] == src[s-1] {
best.offset--
best.length++
s--
}
}
if false && best.offset >= s {
panic(fmt.Errorf("t %d >= s %d", best.offset, s))
}
// Bail if we exceed the maximum size.
if d+(s-nextEmit) > dstLimit {
return 0
}
base := s
offset := s - best.offset
s += best.length
if offset > 65535 && s-base <= 5 {
// Bail if the match is equal or worse to the encoding.
s = best.s + 1
if s >= sLimit {
goto emitRemainder
}
cv = load64(src, s)
continue
}
d += emitLiteral(dst[d:], src[nextEmit:base])
d += emitCopyNoRepeat(dst[d:], offset, best.length)
repeat = offset
nextEmit = s
if s >= sLimit {
goto emitRemainder
}
if d > dstLimit {
// Do we have space for more, if not bail.
return 0
}
// Fill tables...
for i := best.s + 1; i < s; i++ {
cv0 := load64(src, i)
long0 := hash8(cv0, lTableBits)
short0 := hash4(cv0, sTableBits)
lTable[long0] = uint64(i) | lTable[long0]<<32
sTable[short0] = uint64(i) | sTable[short0]<<32
}
cv = load64(src, s)
}
emitRemainder:
if nextEmit < len(src) {
// Bail if we exceed the maximum size.
if d+len(src)-nextEmit > dstLimit {
return 0
}
d += emitLiteral(dst[d:], src[nextEmit:])
}
return d
}
// emitCopySize returns the size to encode the offset+length
//
// It assumes that:
// 1 <= offset && offset <= math.MaxUint32
// 4 <= length && length <= 1 << 24
func emitCopySize(offset, length int) int {
if offset >= 65536 {
i := 0
if length > 64 {
length -= 64
if length >= 4 {
// Emit remaining as repeats
return 5 + emitRepeatSize(offset, length)
}
i = 5
}
if length == 0 {
return i
}
return i + 5
}
// Offset no more than 2 bytes.
if length > 64 {
if offset < 2048 {
// Emit 8 bytes, then rest as repeats...
return 2 + emitRepeatSize(offset, length-8)
}
// Emit remaining as repeats, at least 4 bytes remain.
return 3 + emitRepeatSize(offset, length-60)
}
if length >= 12 || offset >= 2048 {
return 3
}
// Emit the remaining copy, encoded as 2 bytes.
return 2
}
// emitCopyNoRepeatSize returns the size to encode the offset+length
//
// It assumes that:
// 1 <= offset && offset <= math.MaxUint32
// 4 <= length && length <= 1 << 24
func emitCopyNoRepeatSize(offset, length int) int {
if offset >= 65536 {
return 5 + 5*(length/64)
}
// Offset no more than 2 bytes.
if length > 64 {
// Emit remaining as repeats, at least 4 bytes remain.
return 3 + 3*(length/60)
}
if length >= 12 || offset >= 2048 {
return 3
}
// Emit the remaining copy, encoded as 2 bytes.
return 2
}
// emitRepeatSize returns the number of bytes required to encode a repeat.
// Length must be at least 4 and < 1<<24
func emitRepeatSize(offset, length int) int {
// Repeat offset, make length cheaper
if length <= 4+4 || (length < 8+4 && offset < 2048) {
return 2
}
if length < (1<<8)+4+4 {
return 3
}
if length < (1<<16)+(1<<8)+4 {
return 4
}
const maxRepeat = (1 << 24) - 1
length -= (1 << 16) - 4
left := 0
if length > maxRepeat {
left = length - maxRepeat + 4
length = maxRepeat - 4
}
if left > 0 {
return 5 + emitRepeatSize(offset, left)
}
return 5
}

View file

@ -0,0 +1,431 @@
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
// Copyright (c) 2019 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package s2
import (
"math/bits"
)
// hash4 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <32.
func hash4(u uint64, h uint8) uint32 {
const prime4bytes = 2654435761
return (uint32(u) * prime4bytes) >> ((32 - h) & 31)
}
// hash5 returns the hash of the lowest 5 bytes of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <64.
func hash5(u uint64, h uint8) uint32 {
const prime5bytes = 889523592379
return uint32(((u << (64 - 40)) * prime5bytes) >> ((64 - h) & 63))
}
// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <64.
func hash7(u uint64, h uint8) uint32 {
const prime7bytes = 58295818150454627
return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
}
// hash8 returns the hash of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <64.
func hash8(u uint64, h uint8) uint32 {
const prime8bytes = 0xcf1bbcdcb7a56463
return uint32((u * prime8bytes) >> ((64 - h) & 63))
}
// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
// len(dst) >= MaxEncodedLen(len(src)) &&
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetterGo(dst, src []byte) (d int) {
// sLimit is when to stop looking for offset/length copies. The inputMargin
// lets us use a fast path for emitLiteral in the main loop, while we are
// looking for copies.
sLimit := len(src) - inputMargin
if len(src) < minNonLiteralBlockSize {
return 0
}
// Initialize the hash tables.
const (
// Long hash matches.
lTableBits = 16
maxLTableSize = 1 << lTableBits
// Short hash matches.
sTableBits = 14
maxSTableSize = 1 << sTableBits
)
var lTable [maxLTableSize]uint32
var sTable [maxSTableSize]uint32
// Bail if we can't compress to at least this.
dstLimit := len(src) - len(src)>>5 - 6
// nextEmit is where in src the next emitLiteral should start from.
nextEmit := 0
// The encoded form must start with a literal, as there are no previous
// bytes to copy, so we start looking for hash matches at s == 1.
s := 1
cv := load64(src, s)
// We initialize repeat to 0, so we never match on first attempt
repeat := 0
for {
candidateL := 0
nextS := 0
for {
// Next src position to check
nextS = s + (s-nextEmit)>>7 + 1
if nextS > sLimit {
goto emitRemainder
}
hashL := hash7(cv, lTableBits)
hashS := hash4(cv, sTableBits)
candidateL = int(lTable[hashL])
candidateS := int(sTable[hashS])
lTable[hashL] = uint32(s)
sTable[hashS] = uint32(s)
// Check repeat at offset checkRep.
const checkRep = 1
if false && uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
base := s + checkRep
// Extend back
for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
i--
base--
}
d += emitLiteral(dst[d:], src[nextEmit:base])
// Extend forward
candidate := s - repeat + 4 + checkRep
s += 4 + checkRep
for s < len(src) {
if len(src)-s < 8 {
if src[s] == src[candidate] {
s++
candidate++
continue
}
break
}
if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
s += bits.TrailingZeros64(diff) >> 3
break
}
s += 8
candidate += 8
}
if nextEmit > 0 {
// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
d += emitRepeat(dst[d:], repeat, s-base)
} else {
// First match, cannot be repeat.
d += emitCopy(dst[d:], repeat, s-base)
}
nextEmit = s
if s >= sLimit {
goto emitRemainder
}
cv = load64(src, s)
continue
}
if uint32(cv) == load32(src, candidateL) {
break
}
// Check our short candidate
if uint32(cv) == load32(src, candidateS) {
// Try a long candidate at s+1
hashL = hash7(cv>>8, lTableBits)
candidateL = int(lTable[hashL])
lTable[hashL] = uint32(s + 1)
if uint32(cv>>8) == load32(src, candidateL) {
s++
break
}
// Use our short candidate.
candidateL = candidateS
break
}
cv = load64(src, nextS)
s = nextS
}
// Extend backwards
for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
candidateL--
s--
}
// Bail if we exceed the maximum size.
if d+(s-nextEmit) > dstLimit {
return 0
}
base := s
offset := base - candidateL
// Extend the 4-byte match as long as possible.
s += 4
candidateL += 4
for s < len(src) {
if len(src)-s < 8 {
if src[s] == src[candidateL] {
s++
candidateL++
continue
}
break
}
if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
s += bits.TrailingZeros64(diff) >> 3
break
}
s += 8
candidateL += 8
}
if offset > 65535 && s-base <= 5 && repeat != offset {
// Bail if the match is equal or worse to the encoding.
s = nextS + 1
if s >= sLimit {
goto emitRemainder
}
cv = load64(src, s)
continue
}
d += emitLiteral(dst[d:], src[nextEmit:base])
if repeat == offset {
d += emitRepeat(dst[d:], offset, s-base)
} else {
d += emitCopy(dst[d:], offset, s-base)
repeat = offset
}
nextEmit = s
if s >= sLimit {
goto emitRemainder
}
if d > dstLimit {
// Do we have space for more, if not bail.
return 0
}
// Index match start+1 (long) and start+2 (short)
index0 := base + 1
// Index match end-2 (long) and end-1 (short)
index1 := s - 2
cv0 := load64(src, index0)
cv1 := load64(src, index1)
cv = load64(src, s)
lTable[hash7(cv0, lTableBits)] = uint32(index0)
lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1)
lTable[hash7(cv1, lTableBits)] = uint32(index1)
lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)
sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2)
sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
}
emitRemainder:
if nextEmit < len(src) {
// Bail if we exceed the maximum size.
if d+len(src)-nextEmit > dstLimit {
return 0
}
d += emitLiteral(dst[d:], src[nextEmit:])
}
return d
}
// encodeBlockBetterSnappyGo encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
// len(dst) >= MaxEncodedLen(len(src)) &&
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {
// sLimit is when to stop looking for offset/length copies. The inputMargin
// lets us use a fast path for emitLiteral in the main loop, while we are
// looking for copies.
sLimit := len(src) - inputMargin
if len(src) < minNonLiteralBlockSize {
return 0
}
// Initialize the hash tables.
const (
// Long hash matches.
lTableBits = 16
maxLTableSize = 1 << lTableBits
// Short hash matches.
sTableBits = 14
maxSTableSize = 1 << sTableBits
)
var lTable [maxLTableSize]uint32
var sTable [maxSTableSize]uint32
// Bail if we can't compress to at least this.
dstLimit := len(src) - len(src)>>5 - 6
// nextEmit is where in src the next emitLiteral should start from.
nextEmit := 0
// The encoded form must start with a literal, as there are no previous
// bytes to copy, so we start looking for hash matches at s == 1.
s := 1
cv := load64(src, s)
// We initialize repeat to 0, so we never match on first attempt
repeat := 0
const maxSkip = 100
for {
candidateL := 0
nextS := 0
for {
// Next src position to check
nextS = (s-nextEmit)>>7 + 1
if nextS > maxSkip {
nextS = s + maxSkip
} else {
nextS += s
}
if nextS > sLimit {
goto emitRemainder
}
hashL := hash7(cv, lTableBits)
hashS := hash4(cv, sTableBits)
candidateL = int(lTable[hashL])
candidateS := int(sTable[hashS])
lTable[hashL] = uint32(s)
sTable[hashS] = uint32(s)
if uint32(cv) == load32(src, candidateL) {
break
}
// Check our short candidate
if uint32(cv) == load32(src, candidateS) {
// Try a long candidate at s+1
hashL = hash7(cv>>8, lTableBits)
candidateL = int(lTable[hashL])
lTable[hashL] = uint32(s + 1)
if uint32(cv>>8) == load32(src, candidateL) {
s++
break
}
// Use our short candidate.
candidateL = candidateS
break
}
cv = load64(src, nextS)
s = nextS
}
// Extend backwards
for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
candidateL--
s--
}
// Bail if we exceed the maximum size.
if d+(s-nextEmit) > dstLimit {
return 0
}
base := s
offset := base - candidateL
// Extend the 4-byte match as long as possible.
s += 4
candidateL += 4
for s < len(src) {
if len(src)-s < 8 {
if src[s] == src[candidateL] {
s++
candidateL++
continue
}
break
}
if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
s += bits.TrailingZeros64(diff) >> 3
break
}
s += 8
candidateL += 8
}
if offset > 65535 && s-base <= 5 && repeat != offset {
// Bail if the match is equal or worse to the encoding.
s = nextS + 1
if s >= sLimit {
goto emitRemainder
}
cv = load64(src, s)
continue
}
d += emitLiteral(dst[d:], src[nextEmit:base])
d += emitCopyNoRepeat(dst[d:], offset, s-base)
repeat = offset
nextEmit = s
if s >= sLimit {
goto emitRemainder
}
if d > dstLimit {
// Do we have space for more, if not bail.
return 0
}
// Index match start+1 (long) and start+2 (short)
index0 := base + 1
// Index match end-2 (long) and end-1 (short)
index1 := s - 2
cv0 := load64(src, index0)
cv1 := load64(src, index1)
cv = load64(src, s)
lTable[hash7(cv0, lTableBits)] = uint32(index0)
lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1)
lTable[hash7(cv1, lTableBits)] = uint32(index1)
lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)
sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2)
sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
}
emitRemainder:
if nextEmit < len(src) {
// Bail if we exceed the maximum size.
if d+len(src)-nextEmit > dstLimit {
return 0
}
d += emitLiteral(dst[d:], src[nextEmit:])
}
return d
}

307
vendor/github.com/klauspost/compress/s2/encode_go.go generated vendored Normal file
View file

@ -0,0 +1,307 @@
//go:build !amd64 || appengine || !gc || noasm
// +build !amd64 appengine !gc noasm
package s2
import (
"math/bits"
)
// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
// len(dst) >= MaxEncodedLen(len(src))
func encodeBlock(dst, src []byte) (d int) {
if len(src) < minNonLiteralBlockSize {
return 0
}
return encodeBlockGo(dst, src)
}
// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
// len(dst) >= MaxEncodedLen(len(src))
func encodeBlockBetter(dst, src []byte) (d int) {
return encodeBlockBetterGo(dst, src)
}
// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
// len(dst) >= MaxEncodedLen(len(src))
func encodeBlockBetterSnappy(dst, src []byte) (d int) {
return encodeBlockBetterSnappyGo(dst, src)
}
// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
// len(dst) >= MaxEncodedLen(len(src))
func encodeBlockSnappy(dst, src []byte) (d int) {
if len(src) < minNonLiteralBlockSize {
return 0
}
return encodeBlockSnappyGo(dst, src)
}
// emitLiteral writes a literal chunk and returns the number of bytes written.
//
// It assumes that:
// dst is long enough to hold the encoded bytes
// 0 <= len(lit) && len(lit) <= math.MaxUint32
func emitLiteral(dst, lit []byte) int {
if len(lit) == 0 {
return 0
}
const num = 63<<2 | tagLiteral
i, n := 0, uint(len(lit)-1)
switch {
case n < 60:
dst[0] = uint8(n)<<2 | tagLiteral
i = 1
case n < 1<<8:
dst[1] = uint8(n)
dst[0] = 60<<2 | tagLiteral
i = 2
case n < 1<<16:
dst[2] = uint8(n >> 8)
dst[1] = uint8(n)
dst[0] = 61<<2 | tagLiteral
i = 3
case n < 1<<24:
dst[3] = uint8(n >> 16)
dst[2] = uint8(n >> 8)
dst[1] = uint8(n)
dst[0] = 62<<2 | tagLiteral
i = 4
default:
dst[4] = uint8(n >> 24)
dst[3] = uint8(n >> 16)
dst[2] = uint8(n >> 8)
dst[1] = uint8(n)
dst[0] = 63<<2 | tagLiteral
i = 5
}
return i + copy(dst[i:], lit)
}
// emitRepeat writes a repeat chunk and returns the number of bytes written.
// Length must be at least 4 and < 1<<24
func emitRepeat(dst []byte, offset, length int) int {
// Repeat offset, make length cheaper
length -= 4
if length <= 4 {
dst[0] = uint8(length)<<2 | tagCopy1
dst[1] = 0
return 2
}
if length < 8 && offset < 2048 {
// Encode WITH offset
dst[1] = uint8(offset)
dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1
return 2
}
if length < (1<<8)+4 {
length -= 4
dst[2] = uint8(length)
dst[1] = 0
dst[0] = 5<<2 | tagCopy1
return 3
}
if length < (1<<16)+(1<<8) {
length -= 1 << 8
dst[3] = uint8(length >> 8)
dst[2] = uint8(length >> 0)
dst[1] = 0
dst[0] = 6<<2 | tagCopy1
return 4
}
const maxRepeat = (1 << 24) - 1
length -= 1 << 16
left := 0
if length > maxRepeat {
left = length - maxRepeat + 4
length = maxRepeat - 4
}
dst[4] = uint8(length >> 16)
dst[3] = uint8(length >> 8)
dst[2] = uint8(length >> 0)
dst[1] = 0
dst[0] = 7<<2 | tagCopy1
if left > 0 {
return 5 + emitRepeat(dst[5:], offset, left)
}
return 5
}
// emitCopy writes a copy chunk and returns the number of bytes written.
//
// It assumes that:
// dst is long enough to hold the encoded bytes
// 1 <= offset && offset <= math.MaxUint32
// 4 <= length && length <= 1 << 24
func emitCopy(dst []byte, offset, length int) int {
if offset >= 65536 {
i := 0
if length > 64 {
// Emit a length 64 copy, encoded as 5 bytes.
dst[4] = uint8(offset >> 24)
dst[3] = uint8(offset >> 16)
dst[2] = uint8(offset >> 8)
dst[1] = uint8(offset)
dst[0] = 63<<2 | tagCopy4
length -= 64
if length >= 4 {
// Emit remaining as repeats
return 5 + emitRepeat(dst[5:], offset, length)
}
i = 5
}
if length == 0 {
return i
}
// Emit a copy, offset encoded as 4 bytes.
dst[i+0] = uint8(length-1)<<2 | tagCopy4
dst[i+1] = uint8(offset)
dst[i+2] = uint8(offset >> 8)
dst[i+3] = uint8(offset >> 16)
dst[i+4] = uint8(offset >> 24)
return i + 5
}
// Offset no more than 2 bytes.
if length > 64 {
off := 3
if offset < 2048 {
// emit 8 bytes as tagCopy1, rest as repeats.
dst[1] = uint8(offset)
dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1
length -= 8
off = 2
} else {
// Emit a length 60 copy, encoded as 3 bytes.
// Emit remaining as repeat value (minimum 4 bytes).
dst[2] = uint8(offset >> 8)
dst[1] = uint8(offset)
dst[0] = 59<<2 | tagCopy2
length -= 60
}
// Emit remaining as repeats, at least 4 bytes remain.
return off + emitRepeat(dst[off:], offset, length)
}
if length >= 12 || offset >= 2048 {
// Emit the remaining copy, encoded as 3 bytes.
dst[2] = uint8(offset >> 8)
dst[1] = uint8(offset)
dst[0] = uint8(length-1)<<2 | tagCopy2
return 3
}
// Emit the remaining copy, encoded as 2 bytes.
dst[1] = uint8(offset)
dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
return 2
}
// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.
//
// It assumes that:
// dst is long enough to hold the encoded bytes
// 1 <= offset && offset <= math.MaxUint32
// 4 <= length && length <= 1 << 24
func emitCopyNoRepeat(dst []byte, offset, length int) int {
if offset >= 65536 {
i := 0
if length > 64 {
// Emit a length 64 copy, encoded as 5 bytes.
dst[4] = uint8(offset >> 24)
dst[3] = uint8(offset >> 16)
dst[2] = uint8(offset >> 8)
dst[1] = uint8(offset)
dst[0] = 63<<2 | tagCopy4
length -= 64
if length >= 4 {
// Emit remaining as repeats
return 5 + emitCopyNoRepeat(dst[5:], offset, length)
}
i = 5
}
if length == 0 {
return i
}
// Emit a copy, offset encoded as 4 bytes.
dst[i+0] = uint8(length-1)<<2 | tagCopy4
dst[i+1] = uint8(offset)
dst[i+2] = uint8(offset >> 8)
dst[i+3] = uint8(offset >> 16)
dst[i+4] = uint8(offset >> 24)
return i + 5
}
// Offset no more than 2 bytes.
if length > 64 {
// Emit a length 60 copy, encoded as 3 bytes.
// Emit remaining as repeat value (minimum 4 bytes).
dst[2] = uint8(offset >> 8)
dst[1] = uint8(offset)
dst[0] = 59<<2 | tagCopy2
length -= 60
// Emit remaining as repeats, at least 4 bytes remain.
return 3 + emitCopyNoRepeat(dst[3:], offset, length)
}
if length >= 12 || offset >= 2048 {
// Emit the remaining copy, encoded as 3 bytes.
dst[2] = uint8(offset >> 8)
dst[1] = uint8(offset)
dst[0] = uint8(length-1)<<2 | tagCopy2
return 3
}
// Emit the remaining copy, encoded as 2 bytes.
dst[1] = uint8(offset)
dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
return 2
}
// matchLen returns how many bytes match in a and b
//
// It assumes that:
// len(a) <= len(b)
//
func matchLen(a []byte, b []byte) int {
b = b[:len(a)]
var checked int
if len(a) > 4 {
// Try 4 bytes first
if diff := load32(a, 0) ^ load32(b, 0); diff != 0 {
return bits.TrailingZeros32(diff) >> 3
}
// Switch to 8 byte matching.
checked = 4
a = a[4:]
b = b[4:]
for len(a) >= 8 {
b = b[:len(a)]
if diff := load64(a, 0) ^ load64(b, 0); diff != 0 {
return checked + (bits.TrailingZeros64(diff) >> 3)
}
checked += 8
a = a[8:]
b = b[8:]
}
}
b = b[:len(a)]
for i := range a {
if a[i] != b[i] {
return int(i) + checked
}
}
return len(a) + checked
}

View file

@ -0,0 +1,191 @@
// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
//go:build !appengine && !noasm && gc && !noasm
// +build !appengine,!noasm,gc,!noasm
package s2
func _dummy_()
// encodeBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.
// Maximum input 4294967295 bytes.
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
//
//go:noescape
func encodeBlockAsm(dst []byte, src []byte) int
// encodeBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst.
// Maximum input 4194304 bytes.
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
//
//go:noescape
func encodeBlockAsm4MB(dst []byte, src []byte) int
// encodeBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst.
// Maximum input 16383 bytes.
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
//
//go:noescape
func encodeBlockAsm12B(dst []byte, src []byte) int
// encodeBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst.
// Maximum input 4095 bytes.
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
//
//go:noescape
func encodeBlockAsm10B(dst []byte, src []byte) int
// encodeBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst.
// Maximum input 511 bytes.
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
//
//go:noescape
func encodeBlockAsm8B(dst []byte, src []byte) int
// encodeBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.
// Maximum input 4294967295 bytes.
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
//
//go:noescape
func encodeBetterBlockAsm(dst []byte, src []byte) int
// encodeBetterBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst.
// Maximum input 4194304 bytes.
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
//
//go:noescape
func encodeBetterBlockAsm4MB(dst []byte, src []byte) int
// encodeBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst.
// Maximum input 16383 bytes.
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
//
//go:noescape
func encodeBetterBlockAsm12B(dst []byte, src []byte) int
// encodeBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst.
// Maximum input 4095 bytes.
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
//
//go:noescape
func encodeBetterBlockAsm10B(dst []byte, src []byte) int
// encodeBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst.
// Maximum input 511 bytes.
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
//
//go:noescape
func encodeBetterBlockAsm8B(dst []byte, src []byte) int
// encodeSnappyBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.
// Maximum input 4294967295 bytes.
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
//
//go:noescape
func encodeSnappyBlockAsm(dst []byte, src []byte) int
// encodeSnappyBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst.
// Maximum input 65535 bytes.
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
//
//go:noescape
func encodeSnappyBlockAsm64K(dst []byte, src []byte) int
// encodeSnappyBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst.
// Maximum input 16383 bytes.
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
//
//go:noescape
func encodeSnappyBlockAsm12B(dst []byte, src []byte) int
// encodeSnappyBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst.
// Maximum input 4095 bytes.
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
//
//go:noescape
func encodeSnappyBlockAsm10B(dst []byte, src []byte) int
// encodeSnappyBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst.
// Maximum input 511 bytes.
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
//
//go:noescape
func encodeSnappyBlockAsm8B(dst []byte, src []byte) int
// encodeSnappyBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.
// Maximum input 4294967295 bytes.
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
//
//go:noescape
func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
// encodeSnappyBetterBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst.
// Maximum input 65535 bytes.
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
//
//go:noescape
func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int
// encodeSnappyBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst.
// Maximum input 16383 bytes.
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
//
//go:noescape
func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int
// encodeSnappyBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst.
// Maximum input 4095 bytes.
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
//
//go:noescape
func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int
// encodeSnappyBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst.
// Maximum input 511 bytes.
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
//
//go:noescape
func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
// emitLiteral writes a literal chunk and returns the number of bytes written.
//
// It assumes that:
// dst is long enough to hold the encoded bytes with margin of 0 bytes
// 0 <= len(lit) && len(lit) <= math.MaxUint32
//
//go:noescape
func emitLiteral(dst []byte, lit []byte) int
// emitRepeat writes a repeat chunk and returns the number of bytes written.
// Length must be at least 4 and < 1<<32
//
//go:noescape
func emitRepeat(dst []byte, offset int, length int) int
// emitCopy writes a copy chunk and returns the number of bytes written.
//
// It assumes that:
// dst is long enough to hold the encoded bytes
// 1 <= offset && offset <= math.MaxUint32
// 4 <= length && length <= 1 << 24
//
//go:noescape
func emitCopy(dst []byte, offset int, length int) int
// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.
//
// It assumes that:
// dst is long enough to hold the encoded bytes
// 1 <= offset && offset <= math.MaxUint32
// 4 <= length && length <= 1 << 24
//
//go:noescape
func emitCopyNoRepeat(dst []byte, offset int, length int) int
// matchLen returns how many bytes match in a and b
//
// It assumes that:
// len(a) <= len(b)
//
//go:noescape
func matchLen(a []byte, b []byte) int

File diff suppressed because it is too large Load diff

598
vendor/github.com/klauspost/compress/s2/index.go generated vendored Normal file
View file

@ -0,0 +1,598 @@
// Copyright (c) 2022+ Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package s2
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"sort"
)
const (
S2IndexHeader = "s2idx\x00"
S2IndexTrailer = "\x00xdi2s"
maxIndexEntries = 1 << 16
)
// Index represents an S2/Snappy index.
type Index struct {
TotalUncompressed int64 // Total Uncompressed size if known. Will be -1 if unknown.
TotalCompressed int64 // Total Compressed size if known. Will be -1 if unknown.
info []struct {
compressedOffset int64
uncompressedOffset int64
}
estBlockUncomp int64
}
func (i *Index) reset(maxBlock int) {
i.estBlockUncomp = int64(maxBlock)
i.TotalCompressed = -1
i.TotalUncompressed = -1
if len(i.info) > 0 {
i.info = i.info[:0]
}
}
// allocInfos will allocate an empty slice of infos.
func (i *Index) allocInfos(n int) {
if n > maxIndexEntries {
panic("n > maxIndexEntries")
}
i.info = make([]struct {
compressedOffset int64
uncompressedOffset int64
}, 0, n)
}
// add an uncompressed and compressed pair.
// Entries must be sent in order.
func (i *Index) add(compressedOffset, uncompressedOffset int64) error {
if i == nil {
return nil
}
lastIdx := len(i.info) - 1
if lastIdx >= 0 {
latest := i.info[lastIdx]
if latest.uncompressedOffset == uncompressedOffset {
// Uncompressed didn't change, don't add entry,
// but update start index.
latest.compressedOffset = compressedOffset
i.info[lastIdx] = latest
return nil
}
if latest.uncompressedOffset > uncompressedOffset {
return fmt.Errorf("internal error: Earlier uncompressed received (%d > %d)", latest.uncompressedOffset, uncompressedOffset)
}
if latest.compressedOffset > compressedOffset {
return fmt.Errorf("internal error: Earlier compressed received (%d > %d)", latest.uncompressedOffset, uncompressedOffset)
}
}
i.info = append(i.info, struct {
compressedOffset int64
uncompressedOffset int64
}{compressedOffset: compressedOffset, uncompressedOffset: uncompressedOffset})
return nil
}
// Find the offset at or before the wanted (uncompressed) offset.
// If offset is 0 or positive it is the offset from the beginning of the file.
// If the uncompressed size is known, the offset must be within the file.
// If an offset outside the file is requested io.ErrUnexpectedEOF is returned.
// If the offset is negative, it is interpreted as the distance from the end of the file,
// where -1 represents the last byte.
// If offset from the end of the file is requested, but size is unknown,
// ErrUnsupported will be returned.
func (i *Index) Find(offset int64) (compressedOff, uncompressedOff int64, err error) {
if i.TotalUncompressed < 0 {
return 0, 0, ErrCorrupt
}
if offset < 0 {
offset = i.TotalUncompressed + offset
if offset < 0 {
return 0, 0, io.ErrUnexpectedEOF
}
}
if offset > i.TotalUncompressed {
return 0, 0, io.ErrUnexpectedEOF
}
if len(i.info) > 200 {
n := sort.Search(len(i.info), func(n int) bool {
return i.info[n].uncompressedOffset > offset
})
if n == 0 {
n = 1
}
return i.info[n-1].compressedOffset, i.info[n-1].uncompressedOffset, nil
}
for _, info := range i.info {
if info.uncompressedOffset > offset {
break
}
compressedOff = info.compressedOffset
uncompressedOff = info.uncompressedOffset
}
return compressedOff, uncompressedOff, nil
}
// reduce to stay below maxIndexEntries
func (i *Index) reduce() {
if len(i.info) < maxIndexEntries && i.estBlockUncomp >= 1<<20 {
return
}
// Algorithm, keep 1, remove removeN entries...
removeN := (len(i.info) + 1) / maxIndexEntries
src := i.info
j := 0
// Each block should be at least 1MB, but don't reduce below 1000 entries.
for i.estBlockUncomp*(int64(removeN)+1) < 1<<20 && len(i.info)/(removeN+1) > 1000 {
removeN++
}
for idx := 0; idx < len(src); idx++ {
i.info[j] = src[idx]
j++
idx += removeN
}
i.info = i.info[:j]
// Update maxblock estimate.
i.estBlockUncomp += i.estBlockUncomp * int64(removeN)
}
func (i *Index) appendTo(b []byte, uncompTotal, compTotal int64) []byte {
i.reduce()
var tmp [binary.MaxVarintLen64]byte
initSize := len(b)
// We make the start a skippable header+size.
b = append(b, ChunkTypeIndex, 0, 0, 0)
b = append(b, []byte(S2IndexHeader)...)
// Total Uncompressed size
n := binary.PutVarint(tmp[:], uncompTotal)
b = append(b, tmp[:n]...)
// Total Compressed size
n = binary.PutVarint(tmp[:], compTotal)
b = append(b, tmp[:n]...)
// Put EstBlockUncomp size
n = binary.PutVarint(tmp[:], i.estBlockUncomp)
b = append(b, tmp[:n]...)
// Put length
n = binary.PutVarint(tmp[:], int64(len(i.info)))
b = append(b, tmp[:n]...)
// Check if we should add uncompressed offsets
var hasUncompressed byte
for idx, info := range i.info {
if idx == 0 {
if info.uncompressedOffset != 0 {
hasUncompressed = 1
break
}
continue
}
if info.uncompressedOffset != i.info[idx-1].uncompressedOffset+i.estBlockUncomp {
hasUncompressed = 1
break
}
}
b = append(b, hasUncompressed)
// Add each entry
if hasUncompressed == 1 {
for idx, info := range i.info {
uOff := info.uncompressedOffset
if idx > 0 {
prev := i.info[idx-1]
uOff -= prev.uncompressedOffset + (i.estBlockUncomp)
}
n = binary.PutVarint(tmp[:], uOff)
b = append(b, tmp[:n]...)
}
}
// Initial compressed size estimate.
cPredict := i.estBlockUncomp / 2
for idx, info := range i.info {
cOff := info.compressedOffset
if idx > 0 {
prev := i.info[idx-1]
cOff -= prev.compressedOffset + cPredict
// Update compressed size prediction, with half the error.
cPredict += cOff / 2
}
n = binary.PutVarint(tmp[:], cOff)
b = append(b, tmp[:n]...)
}
// Add Total Size.
// Stored as fixed size for easier reading.
binary.LittleEndian.PutUint32(tmp[:], uint32(len(b)-initSize+4+len(S2IndexTrailer)))
b = append(b, tmp[:4]...)
// Trailer
b = append(b, []byte(S2IndexTrailer)...)
// Update size
chunkLen := len(b) - initSize - skippableFrameHeader
b[initSize+1] = uint8(chunkLen >> 0)
b[initSize+2] = uint8(chunkLen >> 8)
b[initSize+3] = uint8(chunkLen >> 16)
//fmt.Printf("chunklen: 0x%x Uncomp:%d, Comp:%d\n", chunkLen, uncompTotal, compTotal)
return b
}
// Load a binary index.
// A zero value Index can be used or a previous one can be reused.
func (i *Index) Load(b []byte) ([]byte, error) {
if len(b) <= 4+len(S2IndexHeader)+len(S2IndexTrailer) {
return b, io.ErrUnexpectedEOF
}
if b[0] != ChunkTypeIndex {
return b, ErrCorrupt
}
chunkLen := int(b[1]) | int(b[2])<<8 | int(b[3])<<16
b = b[4:]
// Validate we have enough...
if len(b) < chunkLen {
return b, io.ErrUnexpectedEOF
}
if !bytes.Equal(b[:len(S2IndexHeader)], []byte(S2IndexHeader)) {
return b, ErrUnsupported
}
b = b[len(S2IndexHeader):]
// Total Uncompressed
if v, n := binary.Varint(b); n <= 0 || v < 0 {
return b, ErrCorrupt
} else {
i.TotalUncompressed = v
b = b[n:]
}
// Total Compressed
if v, n := binary.Varint(b); n <= 0 {
return b, ErrCorrupt
} else {
i.TotalCompressed = v
b = b[n:]
}
// Read EstBlockUncomp
if v, n := binary.Varint(b); n <= 0 {
return b, ErrCorrupt
} else {
if v < 0 {
return b, ErrCorrupt
}
i.estBlockUncomp = v
b = b[n:]
}
var entries int
if v, n := binary.Varint(b); n <= 0 {
return b, ErrCorrupt
} else {
if v < 0 || v > maxIndexEntries {
return b, ErrCorrupt
}
entries = int(v)
b = b[n:]
}
if cap(i.info) < entries {
i.allocInfos(entries)
}
i.info = i.info[:entries]
if len(b) < 1 {
return b, io.ErrUnexpectedEOF
}
hasUncompressed := b[0]
b = b[1:]
if hasUncompressed&1 != hasUncompressed {
return b, ErrCorrupt
}
// Add each uncompressed entry
for idx := range i.info {
var uOff int64
if hasUncompressed != 0 {
// Load delta
if v, n := binary.Varint(b); n <= 0 {
return b, ErrCorrupt
} else {
uOff = v
b = b[n:]
}
}
if idx > 0 {
prev := i.info[idx-1].uncompressedOffset
uOff += prev + (i.estBlockUncomp)
if uOff <= prev {
return b, ErrCorrupt
}
}
if uOff < 0 {
return b, ErrCorrupt
}
i.info[idx].uncompressedOffset = uOff
}
// Initial compressed size estimate.
cPredict := i.estBlockUncomp / 2
// Add each compressed entry
for idx := range i.info {
var cOff int64
if v, n := binary.Varint(b); n <= 0 {
return b, ErrCorrupt
} else {
cOff = v
b = b[n:]
}
if idx > 0 {
// Update compressed size prediction, with half the error.
cPredictNew := cPredict + cOff/2
prev := i.info[idx-1].compressedOffset
cOff += prev + cPredict
if cOff <= prev {
return b, ErrCorrupt
}
cPredict = cPredictNew
}
if cOff < 0 {
return b, ErrCorrupt
}
i.info[idx].compressedOffset = cOff
}
if len(b) < 4+len(S2IndexTrailer) {
return b, io.ErrUnexpectedEOF
}
// Skip size...
b = b[4:]
// Check trailer...
if !bytes.Equal(b[:len(S2IndexTrailer)], []byte(S2IndexTrailer)) {
return b, ErrCorrupt
}
return b[len(S2IndexTrailer):], nil
}
// LoadStream will load an index from the end of the supplied stream.
// ErrUnsupported will be returned if the signature cannot be found.
// ErrCorrupt will be returned if unexpected values are found.
// io.ErrUnexpectedEOF is returned if there are too few bytes.
// IO errors are returned as-is.
func (i *Index) LoadStream(rs io.ReadSeeker) error {
// Go to end.
_, err := rs.Seek(-10, io.SeekEnd)
if err != nil {
return err
}
var tmp [10]byte
_, err = io.ReadFull(rs, tmp[:])
if err != nil {
return err
}
// Check trailer...
if !bytes.Equal(tmp[4:4+len(S2IndexTrailer)], []byte(S2IndexTrailer)) {
return ErrUnsupported
}
sz := binary.LittleEndian.Uint32(tmp[:4])
if sz > maxChunkSize+skippableFrameHeader {
return ErrCorrupt
}
_, err = rs.Seek(-int64(sz), io.SeekEnd)
if err != nil {
return err
}
// Read index.
buf := make([]byte, sz)
_, err = io.ReadFull(rs, buf)
if err != nil {
return err
}
_, err = i.Load(buf)
return err
}
// IndexStream will return an index for a stream.
// The stream structure will be checked, but
// data within blocks is not verified.
// The returned index can either be appended to the end of the stream
// or stored separately.
func IndexStream(r io.Reader) ([]byte, error) {
var i Index
var buf [maxChunkSize]byte
var readHeader bool
for {
_, err := io.ReadFull(r, buf[:4])
if err != nil {
if err == io.EOF {
return i.appendTo(nil, i.TotalUncompressed, i.TotalCompressed), nil
}
return nil, err
}
// Start of this chunk.
startChunk := i.TotalCompressed
i.TotalCompressed += 4
chunkType := buf[0]
if !readHeader {
if chunkType != chunkTypeStreamIdentifier {
return nil, ErrCorrupt
}
readHeader = true
}
chunkLen := int(buf[1]) | int(buf[2])<<8 | int(buf[3])<<16
if chunkLen < checksumSize {
return nil, ErrCorrupt
}
i.TotalCompressed += int64(chunkLen)
_, err = io.ReadFull(r, buf[:chunkLen])
if err != nil {
return nil, io.ErrUnexpectedEOF
}
// The chunk types are specified at
// https://github.com/google/snappy/blob/master/framing_format.txt
switch chunkType {
case chunkTypeCompressedData:
// Section 4.2. Compressed data (chunk type 0x00).
// Skip checksum.
dLen, err := DecodedLen(buf[checksumSize:])
if err != nil {
return nil, err
}
if dLen > maxBlockSize {
return nil, ErrCorrupt
}
if i.estBlockUncomp == 0 {
// Use first block for estimate...
i.estBlockUncomp = int64(dLen)
}
err = i.add(startChunk, i.TotalUncompressed)
if err != nil {
return nil, err
}
i.TotalUncompressed += int64(dLen)
continue
case chunkTypeUncompressedData:
n2 := chunkLen - checksumSize
if n2 > maxBlockSize {
return nil, ErrCorrupt
}
if i.estBlockUncomp == 0 {
// Use first block for estimate...
i.estBlockUncomp = int64(n2)
}
err = i.add(startChunk, i.TotalUncompressed)
if err != nil {
return nil, err
}
i.TotalUncompressed += int64(n2)
continue
case chunkTypeStreamIdentifier:
// Section 4.1. Stream identifier (chunk type 0xff).
if chunkLen != len(magicBody) {
return nil, ErrCorrupt
}
if string(buf[:len(magicBody)]) != magicBody {
if string(buf[:len(magicBody)]) != magicBodySnappy {
return nil, ErrCorrupt
}
}
continue
}
if chunkType <= 0x7f {
// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
return nil, ErrUnsupported
}
if chunkLen > maxChunkSize {
return nil, ErrUnsupported
}
// Section 4.4 Padding (chunk type 0xfe).
// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
}
}
// JSON returns the index as JSON text.
func (i *Index) JSON() []byte {
x := struct {
TotalUncompressed int64 `json:"total_uncompressed"` // Total Uncompressed size if known. Will be -1 if unknown.
TotalCompressed int64 `json:"total_compressed"` // Total Compressed size if known. Will be -1 if unknown.
Offsets []struct {
CompressedOffset int64 `json:"compressed"`
UncompressedOffset int64 `json:"uncompressed"`
} `json:"offsets"`
EstBlockUncomp int64 `json:"est_block_uncompressed"`
}{
TotalUncompressed: i.TotalUncompressed,
TotalCompressed: i.TotalCompressed,
EstBlockUncomp: i.estBlockUncomp,
}
for _, v := range i.info {
x.Offsets = append(x.Offsets, struct {
CompressedOffset int64 `json:"compressed"`
UncompressedOffset int64 `json:"uncompressed"`
}{CompressedOffset: v.compressedOffset, UncompressedOffset: v.uncompressedOffset})
}
b, _ := json.MarshalIndent(x, "", " ")
return b
}
// RemoveIndexHeaders will trim all headers and trailers from a given index.
// This is expected to save 20 bytes.
// These can be restored using RestoreIndexHeaders.
// This removes a layer of security, but is the most compact representation.
// Returns nil if headers contains errors.
// The returned slice references the provided slice.
func RemoveIndexHeaders(b []byte) []byte {
const save = 4 + len(S2IndexHeader) + len(S2IndexTrailer) + 4
if len(b) <= save {
return nil
}
if b[0] != ChunkTypeIndex {
return nil
}
chunkLen := int(b[1]) | int(b[2])<<8 | int(b[3])<<16
b = b[4:]
// Validate we have enough...
if len(b) < chunkLen {
return nil
}
b = b[:chunkLen]
if !bytes.Equal(b[:len(S2IndexHeader)], []byte(S2IndexHeader)) {
return nil
}
b = b[len(S2IndexHeader):]
if !bytes.HasSuffix(b, []byte(S2IndexTrailer)) {
return nil
}
b = bytes.TrimSuffix(b, []byte(S2IndexTrailer))
if len(b) < 4 {
return nil
}
return b[:len(b)-4]
}
// RestoreIndexHeaders will index restore headers removed by RemoveIndexHeaders.
// No error checking is performed on the input.
// If a 0 length slice is sent, it is returned without modification.
func RestoreIndexHeaders(in []byte) []byte {
if len(in) == 0 {
return in
}
b := make([]byte, 0, 4+len(S2IndexHeader)+len(in)+len(S2IndexTrailer)+4)
b = append(b, ChunkTypeIndex, 0, 0, 0)
b = append(b, []byte(S2IndexHeader)...)
b = append(b, in...)
var tmp [4]byte
binary.LittleEndian.PutUint32(tmp[:], uint32(len(b)+4+len(S2IndexTrailer)))
b = append(b, tmp[:4]...)
// Trailer
b = append(b, []byte(S2IndexTrailer)...)
chunkLen := len(b) - skippableFrameHeader
b[1] = uint8(chunkLen >> 0)
b[2] = uint8(chunkLen >> 8)
b[3] = uint8(chunkLen >> 16)
return b
}

143
vendor/github.com/klauspost/compress/s2/s2.go generated vendored Normal file
View file

@ -0,0 +1,143 @@
// Copyright 2011 The Snappy-Go Authors. All rights reserved.
// Copyright (c) 2019 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package s2 implements the S2 compression format.
//
// S2 is an extension of Snappy. Similar to Snappy S2 is aimed for high throughput,
// which is why it features concurrent compression for bigger payloads.
//
// Decoding is compatible with Snappy compressed content,
// but content compressed with S2 cannot be decompressed by Snappy.
//
// For more information on Snappy/S2 differences see README in: https://github.com/klauspost/compress/tree/master/s2
//
// There are actually two S2 formats: block and stream. They are related,
// but different: trying to decompress block-compressed data as a S2 stream
// will fail, and vice versa. The block format is the Decode and Encode
// functions and the stream format is the Reader and Writer types.
//
// A "better" compression option is available. This will trade some compression
// speed
//
// The block format, the more common case, is used when the complete size (the
// number of bytes) of the original data is known upfront, at the time
// compression starts. The stream format, also known as the framing format, is
// for when that isn't always true.
//
// Blocks to not offer much data protection, so it is up to you to
// add data validation of decompressed blocks.
//
// Streams perform CRC validation of the decompressed data.
// Stream compression will also be performed on multiple CPU cores concurrently
// significantly improving throughput.
package s2
import (
"bytes"
"hash/crc32"
)
/*
Each encoded block begins with the varint-encoded length of the decoded data,
followed by a sequence of chunks. Chunks begin and end on byte boundaries. The
first byte of each chunk is broken into its 2 least and 6 most significant bits
called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
Zero means a literal tag. All other values mean a copy tag.
For literal tags:
- If m < 60, the next 1 + m bytes are literal bytes.
- Otherwise, let n be the little-endian unsigned integer denoted by the next
m - 59 bytes. The next 1 + n bytes after that are literal bytes.
For copy tags, length bytes are copied from offset bytes ago, in the style of
Lempel-Ziv compression algorithms. In particular:
- For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
of the offset. The next byte is bits 0-7 of the offset.
- For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
The length is 1 + m. The offset is the little-endian unsigned integer
denoted by the next 2 bytes.
- For l == 3, the offset ranges in [0, 1<<32) and the length in
[1, 65). The length is 1 + m. The offset is the little-endian unsigned
integer denoted by the next 4 bytes.
*/
const (
tagLiteral = 0x00
tagCopy1 = 0x01
tagCopy2 = 0x02
tagCopy4 = 0x03
)
const (
checksumSize = 4
chunkHeaderSize = 4
magicChunk = "\xff\x06\x00\x00" + magicBody
magicChunkSnappy = "\xff\x06\x00\x00" + magicBodySnappy
magicBodySnappy = "sNaPpY"
magicBody = "S2sTwO"
// maxBlockSize is the maximum size of the input to encodeBlock.
//
// For the framing format (Writer type instead of Encode function),
// this is the maximum uncompressed size of a block.
maxBlockSize = 4 << 20
// minBlockSize is the minimum size of block setting when creating a writer.
minBlockSize = 4 << 10
skippableFrameHeader = 4
maxChunkSize = 1<<24 - 1 // 16777215
// Default block size
defaultBlockSize = 1 << 20
// maxSnappyBlockSize is the maximum snappy block size.
maxSnappyBlockSize = 1 << 16
obufHeaderLen = checksumSize + chunkHeaderSize
)
const (
chunkTypeCompressedData = 0x00
chunkTypeUncompressedData = 0x01
ChunkTypeIndex = 0x99
chunkTypePadding = 0xfe
chunkTypeStreamIdentifier = 0xff
)
var crcTable = crc32.MakeTable(crc32.Castagnoli)
// crc implements the checksum specified in section 3 of
// https://github.com/google/snappy/blob/master/framing_format.txt
func crc(b []byte) uint32 {
c := crc32.Update(0, crcTable, b)
return c>>15 | c<<17 + 0xa282ead8
}
// literalExtraSize returns the extra size of encoding n literals.
// n should be >= 0 and <= math.MaxUint32.
func literalExtraSize(n int64) int64 {
if n == 0 {
return 0
}
switch {
case n < 60:
return 1
case n < 1<<8:
return 2
case n < 1<<16:
return 3
case n < 1<<24:
return 4
default:
return 5
}
}
type byter interface {
Bytes() []byte
}
var _ byter = &bytes.Buffer{}

3
vendor/modules.txt vendored
View file

@ -16,7 +16,7 @@ cloud.google.com/go/storage
cloud.google.com/go/storage/internal
cloud.google.com/go/storage/internal/apiv2
cloud.google.com/go/storage/internal/apiv2/stubs
# github.com/VictoriaMetrics/fastcache v1.10.0
# github.com/VictoriaMetrics/fastcache v1.12.0
## explicit; go 1.13
github.com/VictoriaMetrics/fastcache
# github.com/VictoriaMetrics/fasthttp v1.1.0
@ -166,6 +166,7 @@ github.com/klauspost/compress/gzip
github.com/klauspost/compress/huff0
github.com/klauspost/compress/internal/cpuinfo
github.com/klauspost/compress/internal/snapref
github.com/klauspost/compress/s2
github.com/klauspost/compress/zlib
github.com/klauspost/compress/zstd
github.com/klauspost/compress/zstd/internal/xxhash