vendor: update github.com/klauspost/compress from v1.10.7 to v1.10.8

2024-11-21 14:44:00 +00:00 · 2020-06-05 23:51:30 +03:00 · 2020-06-05 23:51:30 +03:00 · 2382053d32
commit 2382053d32
parent 69a647b0d2
13 changed files with 1034 additions and 148 deletions
--- a/go.mod
+++ b/go.mod
@ -13,7 +13,7 @@ require (
 	github.com/cespare/xxhash/v2 v2.1.1
 	github.com/golang/protobuf v1.4.2 // indirect
 	github.com/golang/snappy v0.0.1
-	github.com/klauspost/compress v1.10.7
+	github.com/klauspost/compress v1.10.8
 	github.com/valyala/fastjson v1.5.1
 	github.com/valyala/fastrand v1.0.0
 	github.com/valyala/gozstd v1.7.0
--- a/go.sum
+++ b/go.sum
@ -132,8 +132,8 @@ github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+o
 github.com/klauspost/compress v1.10.4/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
 github.com/klauspost/compress v1.10.5 h1:7q6vHIqubShURwQz8cQK6yIe/xC3IF0Vm7TGfqjewrc=
 github.com/klauspost/compress v1.10.5/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
-github.com/klauspost/compress v1.10.7 h1:7rix8v8GpI3ZBb0nSozFRgbtXKv+hOe+qfEpZqybrAg=
+github.com/klauspost/compress v1.10.8 h1:eLeJ3dr/Y9+XRfJT4l+8ZjmtB5RPJhucH2HeCV5+IZY=
-github.com/klauspost/compress v1.10.7/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
+github.com/klauspost/compress v1.10.8/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
 github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
 github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
 github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
--- a/vendor/github.com/klauspost/compress/fse/bitreader.go
+++ b/vendor/github.com/klauspost/compress/fse/bitreader.go
@ -6,6 +6,7 @@
 package fse
 import (
 	"encoding/binary"
 	"errors"
 	"io"
 )
@ -34,8 +35,12 @@ func (b *bitReader) init(in []byte) error {
 	}
 	b.bitsRead = 64
 	b.value = 0
-	b.fill()
+	if len(in) >= 8 {
-	b.fill()
+		b.fillFastStart()
 	} else {
 		b.fill()
 		b.fill()
 	}
 	b.bitsRead += 8 - uint8(highBits(uint32(v)))
 	return nil
 }
@ -63,8 +68,9 @@ func (b *bitReader) fillFast() {
 	if b.bitsRead < 32 {
 		return
 	}
-	// Do single re-slice to avoid bounds checks.
+	// 2 bounds checks.
-	v := b.in[b.off-4 : b.off]
+	v := b.in[b.off-4:]
 	v = v[:4]
 	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 	b.value = (b.value << 32) | uint64(low)
 	b.bitsRead -= 32
@ -77,7 +83,8 @@ func (b *bitReader) fill() {
 		return
 	}
 	if b.off > 4 {
-		v := b.in[b.off-4 : b.off]
+		v := b.in[b.off-4:]
 		v = v[:4]
 		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 		b.value = (b.value << 32) | uint64(low)
 		b.bitsRead -= 32
@ -91,9 +98,17 @@ func (b *bitReader) fill() {
 	}
 }
 // fillFastStart loads the 8 bytes that end at the current offset into value
 // with a single little endian read and marks the buffer as completely unread.
 // The caller must ensure the reader is empty and at least 8 bytes remain.
 func (b *bitReader) fillFastStart() {
 	// One re-slice keeps the number of bounds checks down.
 	next := b.off - 8
 	b.value = binary.LittleEndian.Uint64(b.in[next:])
 	b.off = next
 	b.bitsRead = 0
 }
 // finished returns true if all bits have been read from the bit stream.
 func (b *bitReader) finished() bool {
-	return b.off == 0 && b.bitsRead >= 64
+	return b.bitsRead >= 64 && b.off == 0
 }
 // close the bitstream and returns an error if out-of-buffer reads occurred.
--- a/vendor/github.com/klauspost/compress/fse/bytereader.go
+++ b/vendor/github.com/klauspost/compress/fse/bytereader.go
@ -25,19 +25,10 @@ func (b *byteReader) advance(n uint) {
 	b.off += int(n)
 }
 // Int32 decodes the four bytes at the current offset as a little endian int32.
 // The reader is passed by value, so the offset is left untouched.
 func (b byteReader) Int32() int32 {
 	end := b.off + 4
 	p := b.b[b.off:end:end]
 	return int32(p[0]) | int32(p[1])<<8 | int32(p[2])<<16 | int32(p[3])<<24
 }
 // Uint32 returns a little endian uint32 starting at current offset.
 func (b byteReader) Uint32() uint32 {
-	b2 := b.b[b.off : b.off+4 : b.off+4]
+	b2 := b.b[b.off:]
 	b2 = b2[:4]
 	v3 := uint32(b2[3])
 	v2 := uint32(b2[2])
 	v1 := uint32(b2[1])
--- a/vendor/github.com/klauspost/compress/huff0/README.md
+++ b/vendor/github.com/klauspost/compress/huff0/README.md
@ -12,8 +12,6 @@ but it can be used as a secondary step to compressors (like Snappy) that does no
 * [Godoc documentation](https://godoc.org/github.com/klauspost/compress/huff0)
 THIS PACKAGE IS NOT CONSIDERED STABLE AND API OR ENCODING MAY CHANGE IN THE FUTURE.
 ## News
 * Mar 2018: First implementation released. Consider this beta software for now.
@ -75,6 +73,8 @@ which can be given to the decompressor.
 Decompressing is done by calling the [`Decompress1X`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch.Decompress1X) 
 or [`Decompress4X`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch.Decompress4X) function.
 For concurrently decompressing content with a fixed table a stateless [`Decoder`](https://godoc.org/github.com/klauspost/compress/huff0#Decoder) can be requested which will remain correct as long as the scratch is unchanged. The capacity of the provided slice indicates the expected output size.
 You must provide the output from the compression stage, at exactly the size you got back. If you receive an error back
 your input was likely corrupted. 
@ -84,4 +84,4 @@ There are no integrity checks, so relying on errors from the decompressor does n
 # Contributing
 Contributions are always welcome. Be aware that adding public functions will require good justification and breaking 
-changes will likely not be accepted. If in doubt open an issue before writing the PR.
+changes will likely not be accepted. If in doubt open an issue before writing the PR.
--- a/vendor/github.com/klauspost/compress/huff0/bitreader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitreader.go
@ -6,6 +6,7 @@
 package huff0
 import (
 	"encoding/binary"
 	"errors"
 	"io"
 )
@ -34,29 +35,16 @@ func (b *bitReader) init(in []byte) error {
 	}
 	b.bitsRead = 64
 	b.value = 0
-	b.fill()
+	if len(in) >= 8 {
-	b.fill()
+		b.fillFastStart()
 	} else {
 		b.fill()
 		b.fill()
 	}
 	b.bitsRead += 8 - uint8(highBit32(uint32(v)))
 	return nil
 }
 // getBits reads n bits from the stream. It yields 0 without touching the
 // reader when n is 0 or when all 64 buffered bits have already been read.
 func (b *bitReader) getBits(n uint8) uint16 {
 	if n != 0 && b.bitsRead < 64 {
 		return b.getBitsFast(n)
 	}
 	return 0
 }
 // getBitsFast reads n bits and records them as consumed.
 // At least one bit must be requested on every call, and nothing verifies
 // that the buffer actually holds that many bits.
 func (b *bitReader) getBitsFast(n uint8) uint16 {
 	const regMask = 64 - 1
 	// Drop the bits that were already read so the next unread bit is on top.
 	aligned := b.value << (b.bitsRead & regMask)
 	b.bitsRead += n
 	// Keep only the top n bits.
 	return uint16(aligned >> ((regMask + 1 - n) & regMask))
 }
 // peekBitsFast requires that at least one bit is requested every time.
 // There are no checks if the buffer is filled.
 func (b *bitReader) peekBitsFast(n uint8) uint16 {
@ -71,21 +59,36 @@ func (b *bitReader) fillFast() {
 	if b.bitsRead < 32 {
 		return
 	}
-	// Do single re-slice to avoid bounds checks.
+
 	// 2 bounds checks.
 	v := b.in[b.off-4 : b.off]
 	v = v[:4]
 	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 	b.value = (b.value << 32) | uint64(low)
 	b.bitsRead -= 32
 	b.off -= 4
 }
 // advance adds n to the count of bits that have been read.
 func (b *bitReader) advance(n uint8) {
 	b.bitsRead = b.bitsRead + n
 }
 // fillFastStart loads the 8 bytes that end at the current offset into value
 // with a single little endian read and marks the buffer as completely unread.
 // The caller must ensure the reader is empty and at least 8 bytes remain.
 func (b *bitReader) fillFastStart() {
 	// One re-slice keeps the number of bounds checks down.
 	next := b.off - 8
 	b.value = binary.LittleEndian.Uint64(b.in[next:])
 	b.off = next
 	b.bitsRead = 0
 }
 // fill() will make sure at least 32 bits are available.
 func (b *bitReader) fill() {
 	if b.bitsRead < 32 {
 		return
 	}
 	if b.off > 4 {
-		v := b.in[b.off-4 : b.off]
+		v := b.in[b.off-4:]
 		v = v[:4]
 		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 		b.value = (b.value << 32) | uint64(low)
 		b.bitsRead -= 32
@ -113,3 +116,214 @@ func (b *bitReader) close() error {
 	}
 	return nil
 }
 // bitReaderBytes reads a bitstream in reverse.
 // The last set bit indicates the start of the stream and is used
 // for aligning the input.
 // Unread bits are kept at the top of value: peekByteFast reads the highest
 // byte and advance shifts consumed bits out to the left.
 type bitReaderBytes struct {
 	// in is the input being read; close sets it to nil.
 	in       []byte
 	off      uint // next byte to read is at in[off - 1]
 	// value is the 64 bit buffer holding the bits fetched from in.
 	value    uint64
 	// bitsRead counts the consumed bits of value; init starts it at 64 (empty)
 	// and close reports an error once it exceeds 64.
 	bitsRead uint8
 }
 // init points the reader at in and primes the bit buffer.
 // It fails when in is empty or when its last byte is zero, because the
 // highest set bit of that byte marks where the stream starts.
 func (b *bitReaderBytes) init(in []byte) error {
 	n := len(in)
 	if n == 0 {
 		return errors.New("corrupt stream: too short")
 	}
 	b.in, b.off = in, uint(n)
 	// The highest bit of the last byte indicates where to start
 	last := in[n-1]
 	if last == 0 {
 		return errors.New("corrupt stream, did not find end of stream")
 	}
 	b.value, b.bitsRead = 0, 64
 	if n < 8 {
 		// Too short for the 8 byte fast path.
 		b.fill()
 		b.fill()
 	} else {
 		b.fillFastStart()
 	}
 	// Skip past the start marker (assumes highBit32 yields the index of the
 	// highest set bit - confirm against its definition).
 	b.advance(8 - uint8(highBit32(uint32(last))))
 	return nil
 }
 // peekByteFast returns the highest 8 bits of the bit buffer without
 // consuming them.
 // There are no checks if the buffer is filled.
 func (b *bitReaderBytes) peekByteFast() uint8 {
 	return uint8(b.value >> 56)
 }
 // advance consumes n bits: they are shifted out of the top of value and
 // added to the bitsRead counter. The shift amount is masked to 0-63.
 func (b *bitReaderBytes) advance(n uint8) {
 	b.value = b.value << (n & 63)
 	b.bitsRead = b.bitsRead + n
 }
 // fillFast tops the buffer up so that at least 32 bits are available.
 // It does nothing when fewer than 32 bits have been consumed.
 // The caller must guarantee that at least 4 unread bytes remain.
 func (b *bitReaderBytes) fillFast() {
 	if b.bitsRead >= 32 {
 		end := b.off
 		// 2 bounds checks.
 		word := b.in[end-4 : end]
 		word = word[:4]
 		lo := uint32(word[0]) | uint32(word[1])<<8 | uint32(word[2])<<16 | uint32(word[3])<<24
 		// Place the new bits directly below the ones still unread.
 		b.value |= uint64(lo) << (b.bitsRead - 32)
 		b.bitsRead -= 32
 		b.off = end - 4
 	}
 }
 // fillFastStart loads the 8 bytes that end at the current offset into value
 // with a single little endian read and marks the buffer as completely unread.
 // The caller must ensure the bitReaderBytes is empty and at least 8 bytes remain.
 func (b *bitReaderBytes) fillFastStart() {
 	// One re-slice keeps the number of bounds checks down.
 	next := b.off - 8
 	b.value = binary.LittleEndian.Uint64(b.in[next:])
 	b.off = next
 	b.bitsRead = 0
 }
 // fill tops the buffer up so that at least 32 bits are available, as far as
 // the remaining input allows. With more than 4 bytes left it pulls in one
 // 32 bit word; otherwise it drains the remaining bytes one at a time.
 func (b *bitReaderBytes) fill() {
 	switch {
 	case b.bitsRead < 32:
 		// Enough bits are buffered already.
 	case b.off > 4:
 		word := b.in[b.off-4:]
 		word = word[:4]
 		lo := uint32(word[0]) | uint32(word[1])<<8 | uint32(word[2])<<16 | uint32(word[3])<<24
 		b.value |= uint64(lo) << (b.bitsRead - 32)
 		b.bitsRead -= 32
 		b.off -= 4
 	default:
 		for ; b.off > 0; b.off-- {
 			b.value |= uint64(b.in[b.off-1]) << (b.bitsRead - 8)
 			b.bitsRead -= 8
 		}
 	}
 }
 // finished reports whether the whole bit stream has been consumed: no input
 // bytes are left and every buffered bit has been read.
 func (b *bitReaderBytes) finished() bool {
 	if b.off != 0 {
 		return false
 	}
 	return b.bitsRead >= 64
 }
 // close drops the reference to the input and returns io.ErrUnexpectedEOF
 // if more bits were consumed than the stream contained.
 func (b *bitReaderBytes) close() error {
 	overread := b.bitsRead > 64
 	// Release reference.
 	b.in = nil
 	if overread {
 		return io.ErrUnexpectedEOF
 	}
 	return nil
 }
 // bitReaderShifted reads a bitstream in reverse.
 // The last set bit indicates the start of the stream and is used
 // for aligning the input.
 // Unread bits are kept at the top of value: peekBitsFast reads from the
 // highest bits and advance shifts consumed bits out to the left.
 type bitReaderShifted struct {
 	// in is the input being read; close sets it to nil.
 	in       []byte
 	off      uint // next byte to read is at in[off - 1]
 	// value is the 64 bit buffer holding the bits fetched from in.
 	value    uint64
 	// bitsRead counts the consumed bits of value; init starts it at 64 (empty)
 	// and close reports an error once it exceeds 64.
 	bitsRead uint8
 }
 // init points the reader at in and primes the bit buffer.
 // It fails when in is empty or when its last byte is zero, because the
 // highest set bit of that byte marks where the stream starts.
 func (b *bitReaderShifted) init(in []byte) error {
 	n := len(in)
 	if n == 0 {
 		return errors.New("corrupt stream: too short")
 	}
 	b.in, b.off = in, uint(n)
 	// The highest bit of the last byte indicates where to start
 	last := in[n-1]
 	if last == 0 {
 		return errors.New("corrupt stream, did not find end of stream")
 	}
 	b.value, b.bitsRead = 0, 64
 	if n < 8 {
 		// Too short for the 8 byte fast path.
 		b.fill()
 		b.fill()
 	} else {
 		b.fillFastStart()
 	}
 	// Skip past the start marker (assumes highBit32 yields the index of the
 	// highest set bit - confirm against its definition).
 	b.advance(8 - uint8(highBit32(uint32(last))))
 	return nil
 }
 // peekBitsFast returns the highest n bits of the bit buffer without
 // consuming them. At least one bit must be requested on every call.
 // There are no checks if the buffer is filled.
 func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
 	shift := (64 - n) & 63
 	return uint16(b.value >> shift)
 }
 // advance consumes n bits: they are shifted out of the top of value and
 // added to the bitsRead counter. The shift amount is masked to 0-63.
 func (b *bitReaderShifted) advance(n uint8) {
 	b.value = b.value << (n & 63)
 	b.bitsRead = b.bitsRead + n
 }
 // fillFast tops the buffer up so that at least 32 bits are available.
 // It does nothing when fewer than 32 bits have been consumed.
 // The caller must guarantee that at least 4 unread bytes remain.
 func (b *bitReaderShifted) fillFast() {
 	if b.bitsRead >= 32 {
 		end := b.off
 		// 2 bounds checks.
 		word := b.in[end-4 : end]
 		word = word[:4]
 		lo := uint32(word[0]) | uint32(word[1])<<8 | uint32(word[2])<<16 | uint32(word[3])<<24
 		// Place the new bits directly below the ones still unread.
 		b.value |= uint64(lo) << ((b.bitsRead - 32) & 63)
 		b.bitsRead -= 32
 		b.off = end - 4
 	}
 }
 // fillFastStart loads the 8 bytes that end at the current offset into value
 // with a single little endian read and marks the buffer as completely unread.
 // The caller must ensure the bitReaderShifted is empty and at least 8 bytes remain.
 func (b *bitReaderShifted) fillFastStart() {
 	// One re-slice keeps the number of bounds checks down.
 	next := b.off - 8
 	b.value = binary.LittleEndian.Uint64(b.in[next:])
 	b.off = next
 	b.bitsRead = 0
 }
 // fill tops the buffer up so that at least 32 bits are available, as far as
 // the remaining input allows. With more than 4 bytes left it pulls in one
 // 32 bit word; otherwise it drains the remaining bytes one at a time.
 func (b *bitReaderShifted) fill() {
 	switch {
 	case b.bitsRead < 32:
 		// Enough bits are buffered already.
 	case b.off > 4:
 		word := b.in[b.off-4:]
 		word = word[:4]
 		lo := uint32(word[0]) | uint32(word[1])<<8 | uint32(word[2])<<16 | uint32(word[3])<<24
 		b.value |= uint64(lo) << ((b.bitsRead - 32) & 63)
 		b.bitsRead -= 32
 		b.off -= 4
 	default:
 		for ; b.off > 0; b.off-- {
 			b.value |= uint64(b.in[b.off-1]) << ((b.bitsRead - 8) & 63)
 			b.bitsRead -= 8
 		}
 	}
 }
 // finished reports whether the whole bit stream has been consumed: no input
 // bytes are left and every buffered bit has been read.
 func (b *bitReaderShifted) finished() bool {
 	if b.off != 0 {
 		return false
 	}
 	return b.bitsRead >= 64
 }
 // close drops the reference to the input and returns io.ErrUnexpectedEOF
 // if more bits were consumed than the stream contained.
 func (b *bitReaderShifted) close() error {
 	overread := b.bitsRead > 64
 	// Release reference.
 	b.in = nil
 	if overread {
 		return io.ErrUnexpectedEOF
 	}
 	return nil
 }
--- a/vendor/github.com/klauspost/compress/huff0/decompress.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress.go
@ -25,6 +25,9 @@ type dEntryDouble struct {
 	len   uint8
 }
 // use8BitTables enables the specialized decoders for all tables that are <= 8 bits.
 const use8BitTables = true
 // ReadTable will read a table from the input.
 // The size of the input may be larger than the table definition.
 // Any content remaining after the table definition will be returned.
@ -83,6 +86,7 @@ func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
 		}
 		v2 := v & 15
 		rankStats[v2]++
 		// (1 << (v2-1)) is slower since the compiler cannot prove that v2 isn't 0.
 		weightTotal += (1 << v2) >> 1
 	}
 	if weightTotal == 0 {
@ -142,12 +146,14 @@ func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
 		d := dEntrySingle{
 			entry: uint16(s.actualTableLog+1-w) | (uint16(n) << 8),
 		}
-		single := s.dt.single[rankStats[w] : rankStats[w]+length]
+		rank := &rankStats[w]
 		single := s.dt.single[*rank : *rank+length]
 		for i := range single {
 			single[i] = d
 		}
-		rankStats[w] += length
+		*rank += length
 	}
 	return s, in, nil
 }
@ -208,7 +214,10 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 	if len(d.dt.single) == 0 {
 		return nil, errors.New("no table loaded")
 	}
-	var br bitReader
+	if use8BitTables && d.actualTableLog <= 8 {
 		return d.decompress1X8Bit(dst, src)
 	}
 	var br bitReaderShifted
 	err := br.init(src)
 	if err != nil {
 		return dst, err
@ -216,17 +225,6 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 	maxDecodedSize := cap(dst)
 	dst = dst[:0]
 	decode := func() byte {
 		val := br.peekBitsFast(d.actualTableLog) /* note : actualTableLog >= 1 */
 		v := d.dt.single[val]
 		br.bitsRead += uint8(v.entry)
 		return uint8(v.entry >> 8)
 	}
 	hasDec := func(v dEntrySingle) byte {
 		br.bitsRead += uint8(v.entry)
 		return uint8(v.entry >> 8)
 	}
 	// Avoid bounds check by always having full sized table.
 	const tlSize = 1 << tableLogMax
 	const tlMask = tlSize - 1
@ -238,11 +236,25 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 	for br.off >= 8 {
 		br.fillFast()
-		buf[off+0] = hasDec(dt[br.peekBitsFast(d.actualTableLog)&tlMask])
+		v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
-		buf[off+1] = hasDec(dt[br.peekBitsFast(d.actualTableLog)&tlMask])
+		br.advance(uint8(v.entry))
 		buf[off+0] = uint8(v.entry >> 8)
 		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
 		br.advance(uint8(v.entry))
 		buf[off+1] = uint8(v.entry >> 8)
 		// Refill
 		br.fillFast()
-		buf[off+2] = hasDec(dt[br.peekBitsFast(d.actualTableLog)&tlMask])
+
-		buf[off+3] = hasDec(dt[br.peekBitsFast(d.actualTableLog)&tlMask])
+		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
 		br.advance(uint8(v.entry))
 		buf[off+2] = uint8(v.entry >> 8)
 		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
 		br.advance(uint8(v.entry))
 		buf[off+3] = uint8(v.entry >> 8)
 		off += 4
 		if off == 0 {
 			if len(dst)+256 > maxDecodedSize {
@ -259,13 +271,196 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 	}
 	dst = append(dst, buf[:off]...)
-	for !br.finished() {
+	// br < 8, so uint8 is fine
 	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
 	for bitsLeft > 0 {
 		br.fill()
 		if false && br.bitsRead >= 32 {
 			if br.off >= 4 {
 				v := br.in[br.off-4:]
 				v = v[:4]
 				low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 				br.value = (br.value << 32) | uint64(low)
 				br.bitsRead -= 32
 				br.off -= 4
 			} else {
 				for br.off > 0 {
 					br.value = (br.value << 8) | uint64(br.in[br.off-1])
 					br.bitsRead -= 8
 					br.off--
 				}
 			}
 		}
 		if len(dst) >= maxDecodedSize {
 			br.close()
 			return nil, ErrMaxDecodedSizeExceeded
 		}
-		dst = append(dst, decode())
+		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
 		nBits := uint8(v.entry)
 		br.advance(nBits)
 		bitsLeft -= nBits
 		dst = append(dst, uint8(v.entry>>8))
 	}
 	return dst, br.close()
 }
 // decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
 // The cap of the output buffer will be the maximum decompressed size.
 // The length of the supplied input must match the end of a block exactly.
 //
 // A table log of exactly 8 is handed off to decompress1X8BitExactly.
 // If the bit reader cannot be initialized, dst is returned unchanged together
 // with the error. ErrMaxDecodedSizeExceeded (with a nil slice) is returned
 // when the output would grow beyond cap(dst).
 func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 	if d.actualTableLog == 8 {
 		return d.decompress1X8BitExactly(dst, src)
 	}
 	var br bitReaderBytes
 	err := br.init(src)
 	if err != nil {
 		return dst, err
 	}
 	maxDecodedSize := cap(dst)
 	dst = dst[:0]
 	// Avoid bounds check by always having full sized table.
 	dt := d.dt.single[:256]
 	// Use temp table to avoid bound checks/append penalty.
 	var buf [256]byte
 	// off is a uint8 on purpose: it wraps back to 0 once buf is full.
 	var off uint8
 	// The peeked byte is shifted down so only the top actualTableLog bits
 	// are used as the table index.
 	shift := (8 - d.actualTableLog) & 7
 	//fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog)
 	// Main loop: decode 4 symbols per iteration while at least 4 input
 	// bytes remain. For every table entry the low byte is the number of bits
 	// to consume and the high byte is the decoded symbol.
 	for br.off >= 4 {
 		br.fillFast()
 		v := dt[br.peekByteFast()>>shift]
 		br.advance(uint8(v.entry))
 		buf[off+0] = uint8(v.entry >> 8)
 		v = dt[br.peekByteFast()>>shift]
 		br.advance(uint8(v.entry))
 		buf[off+1] = uint8(v.entry >> 8)
 		v = dt[br.peekByteFast()>>shift]
 		br.advance(uint8(v.entry))
 		buf[off+2] = uint8(v.entry >> 8)
 		v = dt[br.peekByteFast()>>shift]
 		br.advance(uint8(v.entry))
 		buf[off+3] = uint8(v.entry >> 8)
 		off += 4
 		// off wrapped around: buf holds 256 fresh bytes, flush them.
 		if off == 0 {
 			if len(dst)+256 > maxDecodedSize {
 				br.close()
 				return nil, ErrMaxDecodedSizeExceeded
 			}
 			dst = append(dst, buf[:]...)
 		}
 	}
 	// Flush whatever is left in the temporary buffer.
 	if len(dst)+int(off) > maxDecodedSize {
 		br.close()
 		return nil, ErrMaxDecodedSizeExceeded
 	}
 	dst = append(dst, buf[:off]...)
 	// br.off < 4 here, so br.off*8 fits in a uint8.
 	bitsLeft := int8(uint8(br.off)*8 + (64 - br.bitsRead))
 	// Tail loop: decode one symbol at a time until no bits remain.
 	for bitsLeft > 0 {
 		// Fewer than 8 unread bits are buffered: pull in the remaining
 		// input bytes one at a time.
 		if br.bitsRead >= 64-8 {
 			for br.off > 0 {
 				br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
 				br.bitsRead -= 8
 				br.off--
 			}
 		}
 		if len(dst) >= maxDecodedSize {
 			br.close()
 			return nil, ErrMaxDecodedSizeExceeded
 		}
 		v := dt[br.peekByteFast()>>shift]
 		nBits := uint8(v.entry)
 		br.advance(nBits)
 		bitsLeft -= int8(nBits)
 		dst = append(dst, uint8(v.entry>>8))
 	}
 	return dst, br.close()
 }
 // decompress1X8BitExactly will decompress a 1X encoded stream with tablelog == 8.
 // The cap of the output buffer will be the maximum decompressed size.
 // The length of the supplied input must match the end of a block exactly.
 //
 // decompress1X8Bit calls it when the table log is exactly 8, so the peeked
 // byte indexes the table directly and the shift is the constant 0.
 // If the bit reader cannot be initialized, dst is returned unchanged together
 // with the error. ErrMaxDecodedSizeExceeded (with a nil slice) is returned
 // when the output would grow beyond cap(dst).
 func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
 	var br bitReaderBytes
 	err := br.init(src)
 	if err != nil {
 		return dst, err
 	}
 	maxDecodedSize := cap(dst)
 	dst = dst[:0]
 	// Avoid bounds check by always having full sized table.
 	dt := d.dt.single[:256]
 	// Use temp table to avoid bound checks/append penalty.
 	var buf [256]byte
 	// off is a uint8 on purpose: it wraps back to 0 once buf is full.
 	var off uint8
 	const shift = 0
 	//fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog)
 	// Main loop: decode 4 symbols per iteration while at least 4 input
 	// bytes remain. For every table entry the low byte is the number of bits
 	// to consume and the high byte is the decoded symbol.
 	for br.off >= 4 {
 		br.fillFast()
 		v := dt[br.peekByteFast()>>shift]
 		br.advance(uint8(v.entry))
 		buf[off+0] = uint8(v.entry >> 8)
 		v = dt[br.peekByteFast()>>shift]
 		br.advance(uint8(v.entry))
 		buf[off+1] = uint8(v.entry >> 8)
 		v = dt[br.peekByteFast()>>shift]
 		br.advance(uint8(v.entry))
 		buf[off+2] = uint8(v.entry >> 8)
 		v = dt[br.peekByteFast()>>shift]
 		br.advance(uint8(v.entry))
 		buf[off+3] = uint8(v.entry >> 8)
 		off += 4
 		// off wrapped around: buf holds 256 fresh bytes, flush them.
 		if off == 0 {
 			if len(dst)+256 > maxDecodedSize {
 				br.close()
 				return nil, ErrMaxDecodedSizeExceeded
 			}
 			dst = append(dst, buf[:]...)
 		}
 	}
 	// Flush whatever is left in the temporary buffer.
 	if len(dst)+int(off) > maxDecodedSize {
 		br.close()
 		return nil, ErrMaxDecodedSizeExceeded
 	}
 	dst = append(dst, buf[:off]...)
 	// br.off < 4 here, so br.off*8 fits in a uint8.
 	bitsLeft := int8(uint8(br.off)*8 + (64 - br.bitsRead))
 	// Tail loop: decode one symbol at a time until no bits remain.
 	for bitsLeft > 0 {
 		// Fewer than 8 unread bits are buffered: pull in the remaining
 		// input bytes one at a time.
 		if br.bitsRead >= 64-8 {
 			for br.off > 0 {
 				br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
 				br.bitsRead -= 8
 				br.off--
 			}
 		}
 		if len(dst) >= maxDecodedSize {
 			br.close()
 			return nil, ErrMaxDecodedSizeExceeded
 		}
 		v := dt[br.peekByteFast()>>shift]
 		nBits := uint8(v.entry)
 		br.advance(nBits)
 		bitsLeft -= int8(nBits)
 		dst = append(dst, uint8(v.entry>>8))
 	}
 	return dst, br.close()
 }
@ -274,15 +469,18 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 // The length of the supplied input must match the end of a block exactly.
 // The *capacity* of the dst slice must match the destination size of
 // the uncompressed data exactly.
-func (s *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
-	if len(s.dt.single) == 0 {
+	if len(d.dt.single) == 0 {
 		return nil, errors.New("no table loaded")
 	}
 	if len(src) < 6+(4*1) {
 		return nil, errors.New("input too small")
 	}
 	if use8BitTables && d.actualTableLog <= 8 {
 		return d.decompress4X8bit(dst, src)
 	}
-	var br [4]bitReader
+	var br [4]bitReaderShifted
 	start := 6
 	for i := 0; i < 3; i++ {
 		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
@ -308,14 +506,7 @@ func (s *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 	const tlSize = 1 << tableLogMax
 	const tlMask = tlSize - 1
-	single := s.dt.single[:tlSize]
+	single := d.dt.single[:tlSize]
 	decode := func(br *bitReader) byte {
 		val := br.peekBitsFast(s.actualTableLog) /* note : actualTableLog >= 1 */
 		v := single[val&tlMask]
 		br.bitsRead += uint8(v.entry)
 		return uint8(v.entry >> 8)
 	}
 	// Use temp table to avoid bound checks/append penalty.
 	var buf [256]byte
@ -324,66 +515,63 @@ func (s *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 	// Decode 2 values from each decoder/loop.
 	const bufoff = 256 / 4
 bigloop:
 	for {
-		for i := range br {
+		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
-			br := &br[i]
+			break
 			if br.off < 4 {
 				break bigloop
 			}
 			br.fillFast()
 		}
 		{
 			const stream = 0
-			val := br[stream].peekBitsFast(s.actualTableLog)
+			const stream2 = 1
 			br[stream].fillFast()
 			br[stream2].fillFast()
 			val := br[stream].peekBitsFast(d.actualTableLog)
 			v := single[val&tlMask]
-			br[stream].bitsRead += uint8(v.entry)
+			br[stream].advance(uint8(v.entry))
 			val2 := br[stream].peekBitsFast(s.actualTableLog)
 			v2 := single[val2&tlMask]
 			buf[off+bufoff*stream+1] = uint8(v2.entry >> 8)
 			buf[off+bufoff*stream] = uint8(v.entry >> 8)
 			br[stream].bitsRead += uint8(v2.entry)
 		}
-		{
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
 			const stream = 1
 			val := br[stream].peekBitsFast(s.actualTableLog)
 			v := single[val&tlMask]
 			br[stream].bitsRead += uint8(v.entry)
 			val2 := br[stream].peekBitsFast(s.actualTableLog)
 			v2 := single[val2&tlMask]
-			buf[off+bufoff*stream+1] = uint8(v2.entry >> 8)
+			br[stream2].advance(uint8(v2.entry))
-			buf[off+bufoff*stream] = uint8(v.entry >> 8)
+			buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
-			br[stream].bitsRead += uint8(v2.entry)
+
 			val = br[stream].peekBitsFast(d.actualTableLog)
 			v = single[val&tlMask]
 			br[stream].advance(uint8(v.entry))
 			buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
 			val2 = br[stream2].peekBitsFast(d.actualTableLog)
 			v2 = single[val2&tlMask]
 			br[stream2].advance(uint8(v2.entry))
 			buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
 		}
 		{
 			const stream = 2
-			val := br[stream].peekBitsFast(s.actualTableLog)
+			const stream2 = 3
 			br[stream].fillFast()
 			br[stream2].fillFast()
 			val := br[stream].peekBitsFast(d.actualTableLog)
 			v := single[val&tlMask]
-			br[stream].bitsRead += uint8(v.entry)
+			br[stream].advance(uint8(v.entry))
 			val2 := br[stream].peekBitsFast(s.actualTableLog)
 			v2 := single[val2&tlMask]
 			buf[off+bufoff*stream+1] = uint8(v2.entry >> 8)
 			buf[off+bufoff*stream] = uint8(v.entry >> 8)
 			br[stream].bitsRead += uint8(v2.entry)
 		}
-		{
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
 			const stream = 3
 			val := br[stream].peekBitsFast(s.actualTableLog)
 			v := single[val&tlMask]
 			br[stream].bitsRead += uint8(v.entry)
 			val2 := br[stream].peekBitsFast(s.actualTableLog)
 			v2 := single[val2&tlMask]
-			buf[off+bufoff*stream+1] = uint8(v2.entry >> 8)
+			br[stream2].advance(uint8(v2.entry))
-			buf[off+bufoff*stream] = uint8(v.entry >> 8)
+			buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
-			br[stream].bitsRead += uint8(v2.entry)
+
 			val = br[stream].peekBitsFast(d.actualTableLog)
 			v = single[val&tlMask]
 			br[stream].advance(uint8(v.entry))
 			buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
 			val2 = br[stream2].peekBitsFast(d.actualTableLog)
 			v2 = single[val2&tlMask]
 			br[stream2].advance(uint8(v2.entry))
 			buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
 		}
 		off += 2
@ -422,12 +610,456 @@ bigloop:
 	for i := range br {
 		offset := dstEvery * i
 		br := &br[i]
-		for !br.finished() {
+		bitsLeft := br.off*8 + uint(64-br.bitsRead)
 		for bitsLeft > 0 {
 			br.fill()
 			if false && br.bitsRead >= 32 {
 				if br.off >= 4 {
 					v := br.in[br.off-4:]
 					v = v[:4]
 					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 					br.value = (br.value << 32) | uint64(low)
 					br.bitsRead -= 32
 					br.off -= 4
 				} else {
 					for br.off > 0 {
 						br.value = (br.value << 8) | uint64(br.in[br.off-1])
 						br.bitsRead -= 8
 						br.off--
 					}
 				}
 			}
 			// end inline...
 			if offset >= len(out) {
 				return nil, errors.New("corruption detected: stream overrun 4")
 			}
-			out[offset] = decode(br)
+
 			// Read value and increment offset.
 			val := br.peekBitsFast(d.actualTableLog)
 			v := single[val&tlMask].entry
 			nBits := uint8(v)
 			br.advance(nBits)
 			bitsLeft -= uint(nBits)
 			out[offset] = uint8(v >> 8)
 			offset++
 		}
 		decoded += offset - dstEvery*i
 		err = br.close()
 		if err != nil {
 			return nil, err
 		}
 	}
 	if dstSize != decoded {
 		return nil, errors.New("corruption detected: short output block")
 	}
 	return dst, nil
 }
 // decompress4X8bit will decompress a 4X encoded stream by looking up one
 // input byte at a time in the first 256 entries of d.dt.single.
 // A table log of exactly 8 is handed off to decompress4X8bitExactly.
 // NOTE(review): presumably only reached when actualTableLog <= 8 - confirm at the call site.
 //
 // src starts with a 6-byte header holding three little-endian 16-bit lengths
 // for streams 0-2; stream 3 is whatever follows them.
 // The length of the supplied input must match the end of a block exactly.
 // The *capacity* of the dst slice must match the destination size of
 // the uncompressed data exactly.
 //
 // Stream i is written to dst starting at offset i*((cap(dst)+3)/4).
 // On success dst, resliced to its full capacity, is returned. An error is
 // returned when a stream length does not fit in src, when a reader fails to
 // initialize or close, when output would overrun its quarter of dst, or when
 // the number of decoded bytes differs from cap(dst).
 func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 	if d.actualTableLog == 8 {
 		return d.decompress4X8bitExactly(dst, src)
 	}
 	var br [4]bitReaderBytes
 	// Stream data begins right after the three 2-byte length fields.
 	start := 6
 	for i := 0; i < 3; i++ {
 		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
 		// '>=' rather than '>': at least one byte must remain for the streams that follow.
 		if start+length >= len(src) {
 			return nil, errors.New("truncated input (or invalid offset)")
 		}
 		err := br[i].init(src[start : start+length])
 		if err != nil {
 			return nil, err
 		}
 		start += length
 	}
 	// The fourth stream has no explicit length; it is the remainder of src.
 	err := br[3].init(src[start:])
 	if err != nil {
 		return nil, err
 	}
 	// destination, offset to match first output
 	dstSize := cap(dst)
 	dst = dst[:dstSize]
 	out := dst
 	// Size of the output region of each stream (rounded up).
 	dstEvery := (dstSize + 3) / 4
 	// Peeked bytes are shifted right by 8-actualTableLog (kept within 0..7) before indexing the table.
 	shift := (8 - d.actualTableLog) & 7
 	const tlSize = 1 << 8
 	const tlMask = tlSize - 1
 	single := d.dt.single[:tlSize]
 	// Use temp table to avoid bound checks/append penalty.
 	// buf holds four regions of bufoff bytes, one per stream; off is the
 	// write position inside each region. off is a uint8, so no index
 	// derived from it can exceed the 256-byte buffer.
 	var buf [256]byte
 	var off uint8
 	var decoded int
 	// Decode 4 values from each decoder/loop.
 	const bufoff = 256 / 4
 	for {
 		// Leave the fast loop as soon as any reader has off < 4; the
 		// remaining symbols are handled by the slower loop further down.
 		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
 			break
 		}
 		{
 			// Interleave 2 decodes.
 			// For every table entry the low byte is the number of bits to
 			// consume and the high byte is the decoded symbol.
 			const stream = 0
 			const stream2 = 1
 			br[stream].fillFast()
 			br[stream2].fillFast()
 			v := single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream] = uint8(v >> 8)
 			br[stream].advance(uint8(v))
 			v2 := single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))
 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+1] = uint8(v >> 8)
 			br[stream].advance(uint8(v))
 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))
 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+2] = uint8(v >> 8)
 			br[stream].advance(uint8(v))
 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))
 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+3] = uint8(v >> 8)
 			br[stream].advance(uint8(v))
 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))
 		}
 		{
 			// Same interleaved decode for streams 2 and 3.
 			const stream = 2
 			const stream2 = 3
 			br[stream].fillFast()
 			br[stream2].fillFast()
 			v := single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream] = uint8(v >> 8)
 			br[stream].advance(uint8(v))
 			v2 := single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))
 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+1] = uint8(v >> 8)
 			br[stream].advance(uint8(v))
 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))
 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+2] = uint8(v >> 8)
 			br[stream].advance(uint8(v))
 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))
 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+3] = uint8(v >> 8)
 			br[stream].advance(uint8(v))
 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))
 		}
 		off += 4
 		// All four regions of buf are full: flush them to the output.
 		if off == bufoff {
 			// A full region cannot be larger than one stream's share of dst.
 			if bufoff > dstEvery {
 				return nil, errors.New("corruption detected: stream overrun 1")
 			}
 			copy(out, buf[:bufoff])
 			copy(out[dstEvery:], buf[bufoff:bufoff*2])
 			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
 			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
 			off = 0
 			// Advancing out moves the write position of all four streams at once.
 			out = out[bufoff:]
 			decoded += 256
 			// There must at least be 3 buffers left.
 			if len(out) < dstEvery*3 {
 				return nil, errors.New("corruption detected: stream overrun 2")
 			}
 		}
 	}
 	// Flush whatever the fast loop left in buf (off bytes per stream).
 	if off > 0 {
 		ioff := int(off)
 		if len(out) < dstEvery*3+ioff {
 			return nil, errors.New("corruption detected: stream overrun 3")
 		}
 		copy(out, buf[:off])
 		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
 		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
 		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
 		decoded += int(off) * 4
 		out = out[off:]
 	}
 	// Decode remaining.
 	// Each stream is finished on its own, one symbol at a time.
 	for i := range br {
 		offset := dstEvery * i
 		br := &br[i]
 		// Bits still to decode: br.off*8 plus the 64-bitsRead bits held in br.value.
 		bitsLeft := int(br.off*8) + int(64-br.bitsRead)
 		for bitsLeft > 0 {
 			if br.finished() {
 				return nil, io.ErrUnexpectedEOF
 			}
 			// Inlined refill: triggered once bitsRead has reached 56.
 			if br.bitsRead >= 56 {
 				if br.off >= 4 {
 					// Pull in the 4 bytes preceding br.off as a little-endian
 					// uint32 and OR them into value at bit bitsRead-32.
 					v := br.in[br.off-4:]
 					v = v[:4]
 					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 					br.value |= uint64(low) << (br.bitsRead - 32)
 					br.bitsRead -= 32
 					br.off -= 4
 				} else {
 					// Fewer than 4 bytes remain: add them one at a time.
 					for br.off > 0 {
 						br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
 						br.bitsRead -= 8
 						br.off--
 					}
 				}
 			}
 			// end inline...
 			if offset >= len(out) {
 				return nil, errors.New("corruption detected: stream overrun 4")
 			}
 			// Read value and increment offset.
 			v := single[br.peekByteFast()>>shift].entry
 			nBits := uint8(v)
 			br.advance(nBits)
 			bitsLeft -= int(nBits)
 			out[offset] = uint8(v >> 8)
 			offset++
 		}
 		// Count the bytes this stream produced in the loop above.
 		decoded += offset - dstEvery*i
 		err = br.close()
 		if err != nil {
 			return nil, err
 		}
 	}
 	// The streams together must have produced exactly cap(dst) bytes.
 	if dstSize != decoded {
 		return nil, errors.New("corruption detected: short output block")
 	}
 	return dst, nil
 }
 // decompress4X8bitExactly will decompress a 4X encoded stream.
 // decompress4X8bit calls it when the table log is exactly 8.
 // The length of the supplied input must match the end of a block exactly.
 // The *capacity* of the dst slice must match the destination size of
 // the uncompressed data exactly.
 func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
 	var br [4]bitReaderBytes
 	start := 6
 	for i := 0; i < 3; i++ {
 		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
 		if start+length >= len(src) {
 			return nil, errors.New("truncated input (or invalid offset)")
 		}
 		err := br[i].init(src[start : start+length])
 		if err != nil {
 			return nil, err
 		}
 		start += length
 	}
 	err := br[3].init(src[start:])
 	if err != nil {
 		return nil, err
 	}
 	// destination, offset to match first output
 	dstSize := cap(dst)
 	dst = dst[:dstSize]
 	out := dst
 	dstEvery := (dstSize + 3) / 4
 	const shift = 0
 	const tlSize = 1 << 8
 	const tlMask = tlSize - 1
 	single := d.dt.single[:tlSize]
 	// Use temp table to avoid bound checks/append penalty.
 	var buf [256]byte
 	var off uint8
 	var decoded int
 	// Decode 4 values from each decoder/loop.
 	const bufoff = 256 / 4
 	for {
 		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
 			break
 		}
 		{
 			// Interleave 2 decodes.
 			const stream = 0
 			const stream2 = 1
 			br[stream].fillFast()
 			br[stream2].fillFast()
 			v := single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream] = uint8(v >> 8)
 			br[stream].advance(uint8(v))
 			v2 := single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))
 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+1] = uint8(v >> 8)
 			br[stream].advance(uint8(v))
 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))
 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+2] = uint8(v >> 8)
 			br[stream].advance(uint8(v))
 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))
 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+3] = uint8(v >> 8)
 			br[stream].advance(uint8(v))
 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))
 		}
 		{
 			const stream = 2
 			const stream2 = 3
 			br[stream].fillFast()
 			br[stream2].fillFast()
 			v := single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream] = uint8(v >> 8)
 			br[stream].advance(uint8(v))
 			v2 := single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))
 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+1] = uint8(v >> 8)
 			br[stream].advance(uint8(v))
 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))
 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+2] = uint8(v >> 8)
 			br[stream].advance(uint8(v))
 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))
 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+3] = uint8(v >> 8)
 			br[stream].advance(uint8(v))
 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))
 		}
 		off += 4
 		if off == bufoff {
 			if bufoff > dstEvery {
 				return nil, errors.New("corruption detected: stream overrun 1")
 			}
 			copy(out, buf[:bufoff])
 			copy(out[dstEvery:], buf[bufoff:bufoff*2])
 			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
 			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
 			off = 0
 			out = out[bufoff:]
 			decoded += 256
 			// There must at least be 3 buffers left.
 			if len(out) < dstEvery*3 {
 				return nil, errors.New("corruption detected: stream overrun 2")
 			}
 		}
 	}
 	if off > 0 {
 		ioff := int(off)
 		if len(out) < dstEvery*3+ioff {
 			return nil, errors.New("corruption detected: stream overrun 3")
 		}
 		copy(out, buf[:off])
 		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
 		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
 		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
 		decoded += int(off) * 4
 		out = out[off:]
 	}
 	// Decode remaining.
 	for i := range br {
 		offset := dstEvery * i
 		br := &br[i]
 		bitsLeft := int(br.off*8) + int(64-br.bitsRead)
 		for bitsLeft > 0 {
 			if br.finished() {
 				return nil, io.ErrUnexpectedEOF
 			}
 			if br.bitsRead >= 56 {
 				if br.off >= 4 {
 					v := br.in[br.off-4:]
 					v = v[:4]
 					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 					br.value |= uint64(low) << (br.bitsRead - 32)
 					br.bitsRead -= 32
 					br.off -= 4
 				} else {
 					for br.off > 0 {
 						br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
 						br.bitsRead -= 8
 						br.off--
 					}
 				}
 			}
 			// end inline...
 			if offset >= len(out) {
 				return nil, errors.New("corruption detected: stream overrun 4")
 			}
 			// Read value and increment offset.
 			v := single[br.peekByteFast()>>shift].entry
 			nBits := uint8(v)
 			br.advance(nBits)
 			bitsLeft -= int(nBits)
 			out[offset] = uint8(v >> 8)
 			offset++
 		}
 		decoded += offset - dstEvery*i
--- a/vendor/github.com/klauspost/compress/zstd/bitreader.go
+++ b/vendor/github.com/klauspost/compress/zstd/bitreader.go
@ -5,6 +5,7 @@
 package zstd
 import (
 	"encoding/binary"
 	"errors"
 	"io"
 	"math/bits"
@ -34,8 +35,12 @@ func (b *bitReader) init(in []byte) error {
 	}
 	b.bitsRead = 64
 	b.value = 0
-	b.fill()
+	if len(in) >= 8 {
-	b.fill()
+		b.fillFastStart()
 	} else {
 		b.fill()
 		b.fill()
 	}
 	b.bitsRead += 8 - uint8(highBits(uint32(v)))
 	return nil
 }
@ -63,21 +68,31 @@ func (b *bitReader) fillFast() {
 	if b.bitsRead < 32 {
 		return
 	}
-	// Do single re-slice to avoid bounds checks.
+	// 2 bounds checks.
-	v := b.in[b.off-4 : b.off]
+	v := b.in[b.off-4:]
 	v = v[:4]
 	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 	b.value = (b.value << 32) | uint64(low)
 	b.bitsRead -= 32
 	b.off -= 4
 }
 // fillFastStart replaces the bit buffer with the 8 input bytes that precede
 // b.off, read as one little-endian uint64, and marks no bits as consumed.
 // The caller must ensure the bitreader is empty and that there are at
 // least 8 bytes to read.
 func (b *bitReader) fillFastStart() {
 	// Slice once from the new offset; Uint64 reads the 8 bytes starting there.
 	next := b.off - 8
 	b.value = binary.LittleEndian.Uint64(b.in[next:])
 	b.off = next
 	b.bitsRead = 0
 }
 // fill() will make sure at least 32 bits are available.
 func (b *bitReader) fill() {
 	if b.bitsRead < 32 {
 		return
 	}
 	if b.off >= 4 {
-		v := b.in[b.off-4 : b.off]
+		v := b.in[b.off-4:]
 		v = v[:4]
 		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 		b.value = (b.value << 32) | uint64(low)
 		b.bitsRead -= 32
--- a/vendor/github.com/klauspost/compress/zstd/blockdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go
@ -83,6 +83,10 @@ type blockDec struct {
 	err         error
 	decWG       sync.WaitGroup
 	// Frame to use for singlethreaded decoding.
 	// Should not be used by the decoder itself since parent may be another frame.
 	localFrame *frameDec
 	// Block is RLE, this is the size.
 	RLESize uint32
 	tmp     [4]byte
--- a/vendor/github.com/klauspost/compress/zstd/bytereader.go
+++ b/vendor/github.com/klauspost/compress/zstd/bytereader.go
@ -4,8 +4,6 @@
 package zstd
 import "encoding/binary"
 // byteReader provides a byte reader that reads
 // little endian values from a byte stream.
 // The input stream is manually advanced.
@ -33,7 +31,8 @@ func (b *byteReader) overread() bool {
 // Int32 returns a little endian int32 starting at current offset.
 func (b byteReader) Int32() int32 {
-	b2 := b.b[b.off : b.off+4 : b.off+4]
+	b2 := b.b[b.off:]
 	b2 = b2[:4]
 	v3 := int32(b2[3])
 	v2 := int32(b2[2])
 	v1 := int32(b2[1])
@ -57,7 +56,25 @@ func (b byteReader) Uint32() uint32 {
 		}
 		return v
 	}
-	return binary.LittleEndian.Uint32(b.b[b.off : b.off+4])
+	b2 := b.b[b.off:]
 	b2 = b2[:4]
 	v3 := uint32(b2[3])
 	v2 := uint32(b2[2])
 	v1 := uint32(b2[1])
 	v0 := uint32(b2[0])
 	return v0 | (v1 << 8) | (v2 << 16) | (v3 << 24)
 }
 // Uint32NC returns the little endian uint32 that starts at the current
 // offset, without checking how many bytes remain.
 // The caller must make sure that at least 4 bytes are left.
 func (b byteReader) Uint32NC() uint32 {
 	// Reslice to exactly 4 bytes up front so the four loads share one check.
 	src := b.b[b.off:][:4]
 	return uint32(src[0]) | uint32(src[1])<<8 | uint32(src[2])<<16 | uint32(src[3])<<24
 }
 // unread returns the unread portion of the input.
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@ -23,9 +23,6 @@ type Decoder struct {
 	// Unreferenced decoders, ready for use.
 	decoders chan *blockDec
 	// Unreferenced decoders, ready for use.
 	frames chan *frameDec
 	// Streams ready to be decoded.
 	stream chan decodeStream
@ -90,10 +87,10 @@ func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) {
 	// Create decoders
 	d.decoders = make(chan *blockDec, d.o.concurrent)
 	d.frames = make(chan *frameDec, d.o.concurrent)
 	for i := 0; i < d.o.concurrent; i++ {
-		d.frames <- newFrameDec(d.o)
+		dec := newBlockDec(d.o.lowMem)
-		d.decoders <- newBlockDec(d.o.lowMem)
+		dec.localFrame = newFrameDec(d.o)
 		d.decoders <- dec
 	}
 	if r == nil {
@ -283,15 +280,15 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 	}
 	// Grab a block decoder and frame decoder.
-	block, frame := <-d.decoders, <-d.frames
+	block := <-d.decoders
 	frame := block.localFrame
 	defer func() {
 		if debug {
 			printf("re-adding decoder: %p", block)
 		}
 		d.decoders <- block
 		frame.rawInput = nil
 		frame.bBuf = nil
-		d.frames <- frame
+		d.decoders <- block
 	}()
 	frame.bBuf = input
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
@ -55,7 +55,7 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 	if b.remain() < 4 {
 		return errors.New("input too small")
 	}
-	bitStream := b.Uint32()
+	bitStream := b.Uint32NC()
 	nbBits := uint((bitStream & 0xF) + minTablelog) // extract tableLog
 	if nbBits > tablelogAbsoluteMax {
 		println("Invalid tablelog:", nbBits)
@ -79,7 +79,8 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 				n0 += 24
 				if r := b.remain(); r > 5 {
 					b.advance(2)
-					bitStream = b.Uint32() >> bitCount
+					// The check above should make sure we can read 32 bits
 					bitStream = b.Uint32NC() >> bitCount
 				} else {
 					// end of bit stream
 					bitStream >>= 16
@ -104,10 +105,11 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 				charnum++
 			}
-			if r := b.remain(); r >= 7 || r+int(bitCount>>3) >= 4 {
+			if r := b.remain(); r >= 7 || r-int(bitCount>>3) >= 4 {
 				b.advance(bitCount >> 3)
 				bitCount &= 7
-				bitStream = b.Uint32() >> bitCount
+				// The check above should make sure we can read 32 bits
 				bitStream = b.Uint32NC() >> bitCount
 			} else {
 				bitStream >>= 2
 			}
@ -148,17 +150,16 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 			threshold >>= 1
 		}
-		//println("b.off:", b.off, "len:", len(b.b), "bc:", bitCount, "remain:", b.remain())
+		if r := b.remain(); r >= 7 || r-int(bitCount>>3) >= 4 {
 		if r := b.remain(); r >= 7 || r+int(bitCount>>3) >= 4 {
 			b.advance(bitCount >> 3)
 			bitCount &= 7
 			// The check above should make sure we can read 32 bits
 			bitStream = b.Uint32NC() >> (bitCount & 31)
 		} else {
 			bitCount -= (uint)(8 * (len(b.b) - 4 - b.off))
 			b.off = len(b.b) - 4
-			//println("b.off:", b.off, "len:", len(b.b), "bc:", bitCount, "iend", iend)
+			bitStream = b.Uint32() >> (bitCount & 31)
 		}
 		bitStream = b.Uint32() >> (bitCount & 31)
 		//printf("bitstream is now: 0b%b", bitStream)
 	}
 	s.symbolLen = charnum
 	if s.symbolLen <= 1 {
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@ -89,7 +89,7 @@ github.com/jmespath/go-jmespath
 github.com/jstemmer/go-junit-report
 github.com/jstemmer/go-junit-report/formatter
 github.com/jstemmer/go-junit-report/parser
-# github.com/klauspost/compress v1.10.7
+# github.com/klauspost/compress v1.10.8
 github.com/klauspost/compress/flate
 github.com/klauspost/compress/fse
 github.com/klauspost/compress/gzip