diff --git a/go.mod b/go.mod
index 2bd4c7c38..ec4dc3ccc 100644
--- a/go.mod
+++ b/go.mod
@@ -11,7 +11,7 @@ require (
 	github.com/fsnotify/fsnotify v1.4.9 // indirect
 	github.com/hinshun/vt10x v0.0.0-20180809195222-d55458df857c
 	github.com/hokaccha/go-prettyjson v0.0.0-20190818114111-108c894c2c0e
-	github.com/honeycombio/libhoney-go v1.15.2
+	github.com/honeycombio/libhoney-go v1.15.3
 	github.com/kr/pty v1.1.8 // indirect
 	github.com/kyokomi/emoji/v2 v2.2.8
 	github.com/mitchellh/go-homedir v1.1.0
diff --git a/go.sum b/go.sum
index f255b68d7..1075327ae 100644
--- a/go.sum
+++ b/go.sum
@@ -16,8 +16,8 @@ github.com/AlecAivazis/survey/v2 v2.2.12/go.mod h1:6d4saEvBsfSHXeN1a5OA5m2+HJ2Lu
 github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ=
 github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
 github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
-github.com/DataDog/zstd v1.4.5 h1:EndNeuB0l9syBZhut0wns3gV1hL8zX8LIu6ZiVHWLIQ=
-github.com/DataDog/zstd v1.4.5/go.mod h1:1jcaCB/ufaK+sKp1NBhlGmpz41jOoPQ35bpF36t7BBo=
+github.com/DataDog/zstd v1.4.8 h1:Rpmta4xZ/MgZnriKNd24iZMhGpP5dvUcs/uqfBapKZY=
+github.com/DataDog/zstd v1.4.8/go.mod h1:g4AWEaM3yOg3HYfnJ3YIawPnVdXJh9QME85blwSAmyw=
 github.com/Netflix/go-expect v0.0.0-20180615182759-c93bf25de8e8/go.mod h1:oX5x61PbNXchhh0oikYAH+4Pcfw5LKv21+Jnpr6r6Pc=
 github.com/Netflix/go-expect v0.0.0-20200312175327-da48e75238e2 h1:y2avNRjCeJT8b7svzjhKZjsvW5Jki/iAqTBEPJURaUg=
 github.com/Netflix/go-expect v0.0.0-20200312175327-da48e75238e2/go.mod h1:oX5x61PbNXchhh0oikYAH+4Pcfw5LKv21+Jnpr6r6Pc=
@@ -86,6 +86,8 @@ github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5y
 github.com/golang/protobuf v1.3.4/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw=
 github.com/golang/protobuf v1.3.5 h1:F768QJ1E9tib+q5Sc8MkdJi1RxLTbRcTf8LJV56aRls=
 github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk=
+github.com/golang/snappy v0.0.3 h1:fHPg5GQYlCeLIPB9BZqMVR5nR9A+IM5zcgeTdjMYmLA=
+github.com/golang/snappy v0.0.3/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
 github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
 github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
 github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
@@ -128,8 +130,8 @@ github.com/hinshun/vt10x v0.0.0-20180809195222-d55458df857c h1:kp3AxgXgDOmIJFR7b
 github.com/hinshun/vt10x v0.0.0-20180809195222-d55458df857c/go.mod h1:DqJ97dSdRW1W22yXSB90986pcOyQ7r45iio1KN2ez1A=
 github.com/hokaccha/go-prettyjson v0.0.0-20190818114111-108c894c2c0e h1:0aewS5NTyxftZHSnFaJmWE5oCCrj4DyEXkAiMa1iZJM=
 github.com/hokaccha/go-prettyjson v0.0.0-20190818114111-108c894c2c0e/go.mod h1:pFlLw2CfqZiIBOx6BuCeRLCrfxBJipTY0nIOF/VbGcI=
-github.com/honeycombio/libhoney-go v1.15.2 h1:5NGcjOxZZma13dmzNcl3OtGbF1hECA0XHJNHEb2t2ck=
-github.com/honeycombio/libhoney-go v1.15.2/go.mod h1:JzhRPYgoBCd0rZvudrqmej4Ntx0w7AT3wAJpf5+t1WA=
+github.com/honeycombio/libhoney-go v1.15.3 h1:/i2u+MsfAKX8arTKhflprtjqXs+qaODRKmAJUk51qQw=
+github.com/honeycombio/libhoney-go v1.15.3/go.mod h1:SMz4WFtnFt0WxkBfJRLju/3UueS9V6pePG9KoI5c7rg=
 github.com/inconshreveable/mousetrap v1.0.0 h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NHg9XEKhtSvM=
 github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8=
 github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo=
@@ -142,8 +144,8 @@ github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNU
 github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8=
 github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q=
 github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
-github.com/klauspost/compress v1.11.4 h1:kz40R/YWls3iqT9zX9AHN3WoVsrAWVyui5sxuLqiXqU=
-github.com/klauspost/compress v1.11.4/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
+github.com/klauspost/compress v1.12.2 h1:2KCfW3I9M7nSc5wOqXAlW2v2U6v+w6cbjvbfp+OykW8=
+github.com/klauspost/compress v1.12.2/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg=
 github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
 github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
 github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
diff --git a/vendor/github.com/klauspost/compress/snappy/.gitignore b/vendor/github.com/golang/snappy/.gitignore
similarity index 100%
rename from vendor/github.com/klauspost/compress/snappy/.gitignore
rename to vendor/github.com/golang/snappy/.gitignore
diff --git a/vendor/github.com/klauspost/compress/snappy/AUTHORS b/vendor/github.com/golang/snappy/AUTHORS
similarity index 90%
rename from vendor/github.com/klauspost/compress/snappy/AUTHORS
rename to vendor/github.com/golang/snappy/AUTHORS
index bcfa19520..203e84eba 100644
--- a/vendor/github.com/klauspost/compress/snappy/AUTHORS
+++ b/vendor/github.com/golang/snappy/AUTHORS
@@ -8,8 +8,10 @@
 
 # Please keep the list sorted.
 
+Amazon.com, Inc
 Damian Gryski <dgryski@gmail.com>
 Google Inc.
 Jan Mercl <0xjnml@gmail.com>
+Klaus Post <klauspost@gmail.com>
 Rodolfo Carvalho <rhcarvalho@gmail.com>
 Sebastien Binet <seb.binet@gmail.com>
diff --git a/vendor/github.com/klauspost/compress/snappy/CONTRIBUTORS b/vendor/github.com/golang/snappy/CONTRIBUTORS
similarity index 95%
rename from vendor/github.com/klauspost/compress/snappy/CONTRIBUTORS
rename to vendor/github.com/golang/snappy/CONTRIBUTORS
index 931ae3160..d9914732b 100644
--- a/vendor/github.com/klauspost/compress/snappy/CONTRIBUTORS
+++ b/vendor/github.com/golang/snappy/CONTRIBUTORS
@@ -28,7 +28,9 @@
 
 Damian Gryski <dgryski@gmail.com>
 Jan Mercl <0xjnml@gmail.com>
+Jonathan Swinney <jswinney@amazon.com>
 Kai Backman <kaib@golang.org>
+Klaus Post <klauspost@gmail.com>
 Marc-Antoine Ruel <maruel@chromium.org>
 Nigel Tao <nigeltao@golang.org>
 Rob Pike <r@golang.org>
diff --git a/vendor/github.com/klauspost/compress/snappy/LICENSE b/vendor/github.com/golang/snappy/LICENSE
similarity index 100%
rename from vendor/github.com/klauspost/compress/snappy/LICENSE
rename to vendor/github.com/golang/snappy/LICENSE
diff --git a/vendor/github.com/klauspost/compress/snappy/README b/vendor/github.com/golang/snappy/README
similarity index 100%
rename from vendor/github.com/klauspost/compress/snappy/README
rename to vendor/github.com/golang/snappy/README
diff --git a/vendor/github.com/klauspost/compress/snappy/decode.go b/vendor/github.com/golang/snappy/decode.go
similarity index 97%
rename from vendor/github.com/klauspost/compress/snappy/decode.go
rename to vendor/github.com/golang/snappy/decode.go
index 72efb0353..f1e04b172 100644
--- a/vendor/github.com/klauspost/compress/snappy/decode.go
+++ b/vendor/github.com/golang/snappy/decode.go
@@ -52,6 +52,8 @@ const (
 // Otherwise, a newly allocated slice will be returned.
 //
 // The dst and src must not overlap. It is valid to pass a nil dst.
+//
+// Decode handles the Snappy block format, not the Snappy stream format.
 func Decode(dst, src []byte) ([]byte, error) {
 	dLen, s, err := decodedLen(src)
 	if err != nil {
@@ -83,6 +85,8 @@ func NewReader(r io.Reader) *Reader {
 }
 
 // Reader is an io.Reader that can read Snappy-compressed bytes.
+//
+// Reader handles the Snappy stream format, not the Snappy block format.
 type Reader struct {
 	r       io.Reader
 	err     error
diff --git a/vendor/github.com/klauspost/compress/snappy/decode_amd64.s b/vendor/github.com/golang/snappy/decode_amd64.s
similarity index 98%
rename from vendor/github.com/klauspost/compress/snappy/decode_amd64.s
rename to vendor/github.com/golang/snappy/decode_amd64.s
index 1c66e3723..e6179f65e 100644
--- a/vendor/github.com/klauspost/compress/snappy/decode_amd64.s
+++ b/vendor/github.com/golang/snappy/decode_amd64.s
@@ -184,7 +184,9 @@ tagLit60Plus:
 	// checks. In the asm version, we code it once instead of once per switch case.
 	ADDQ CX, SI
 	SUBQ $58, SI
-	CMPQ SI, R13
+	MOVQ SI, BX
+	SUBQ R11, BX
+	CMPQ BX, R12
 	JA   errCorrupt
 
 	// case x == 60:
@@ -230,7 +232,9 @@ tagCopy4:
 	ADDQ $5, SI
 
 	// if uint(s) > uint(len(src)) { etc }
-	CMPQ SI, R13
+	MOVQ SI, BX
+	SUBQ R11, BX
+	CMPQ BX, R12
 	JA   errCorrupt
 
 	// length = 1 + int(src[s-5])>>2
@@ -247,7 +251,9 @@ tagCopy2:
 	ADDQ $3, SI
 
 	// if uint(s) > uint(len(src)) { etc }
-	CMPQ SI, R13
+	MOVQ SI, BX
+	SUBQ R11, BX
+	CMPQ BX, R12
 	JA   errCorrupt
 
 	// length = 1 + int(src[s-3])>>2
@@ -271,7 +277,9 @@ tagCopy:
 	ADDQ $2, SI
 
 	// if uint(s) > uint(len(src)) { etc }
-	CMPQ SI, R13
+	MOVQ SI, BX
+	SUBQ R11, BX
+	CMPQ BX, R12
 	JA   errCorrupt
 
 	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
diff --git a/vendor/github.com/golang/snappy/decode_arm64.s b/vendor/github.com/golang/snappy/decode_arm64.s
new file mode 100644
index 000000000..7a3ead17e
--- /dev/null
+++ b/vendor/github.com/golang/snappy/decode_arm64.s
@@ -0,0 +1,494 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// The asm code generally follows the pure Go code in decode_other.go, except
+// where marked with a "!!!".
+
+// func decode(dst, src []byte) int
+//
+// All local variables fit into registers. The non-zero stack size is only to
+// spill registers and push args when issuing a CALL. The register allocation:
+//	- R2	scratch
+//	- R3	scratch
+//	- R4	length or x
+//	- R5	offset
+//	- R6	&src[s]
+//	- R7	&dst[d]
+//	+ R8	dst_base
+//	+ R9	dst_len
+//	+ R10	dst_base + dst_len
+//	+ R11	src_base
+//	+ R12	src_len
+//	+ R13	src_base + src_len
+//	- R14	used by doCopy
+//	- R15	used by doCopy
+//
+// The registers R8-R13 (marked with a "+") are set at the start of the
+// function, and after a CALL returns, and are not otherwise modified.
+//
+// The d variable is implicitly R7 - R8,  and len(dst)-d is R10 - R7.
+// The s variable is implicitly R6 - R11, and len(src)-s is R13 - R6.
+TEXT ·decode(SB), NOSPLIT, $56-56
+	// Initialize R6, R7 and R8-R13.
+	MOVD dst_base+0(FP), R8
+	MOVD dst_len+8(FP), R9
+	MOVD R8, R7
+	MOVD R8, R10
+	ADD  R9, R10, R10
+	MOVD src_base+24(FP), R11
+	MOVD src_len+32(FP), R12
+	MOVD R11, R6
+	MOVD R11, R13
+	ADD  R12, R13, R13
+
+loop:
+	// for s < len(src)
+	CMP R13, R6
+	BEQ end
+
+	// R4 = uint32(src[s])
+	//
+	// switch src[s] & 0x03
+	MOVBU (R6), R4
+	MOVW  R4, R3
+	ANDW  $3, R3
+	MOVW  $1, R1
+	CMPW  R1, R3
+	BGE   tagCopy
+
+	// ----------------------------------------
+	// The code below handles literal tags.
+
+	// case tagLiteral:
+	// x := uint32(src[s] >> 2)
+	// switch
+	MOVW $60, R1
+	LSRW $2, R4, R4
+	CMPW R4, R1
+	BLS  tagLit60Plus
+
+	// case x < 60:
+	// s++
+	ADD $1, R6, R6
+
+doLit:
+	// This is the end of the inner "switch", when we have a literal tag.
+	//
+	// We assume that R4 == x and x fits in a uint32, where x is the variable
+	// used in the pure Go decode_other.go code.
+
+	// length = int(x) + 1
+	//
+	// Unlike the pure Go code, we don't need to check if length <= 0 because
+	// R4 can hold 64 bits, so the increment cannot overflow.
+	ADD $1, R4, R4
+
+	// Prepare to check if copying length bytes will run past the end of dst or
+	// src.
+	//
+	// R2 = len(dst) - d
+	// R3 = len(src) - s
+	MOVD R10, R2
+	SUB  R7, R2, R2
+	MOVD R13, R3
+	SUB  R6, R3, R3
+
+	// !!! Try a faster technique for short (16 or fewer bytes) copies.
+	//
+	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
+	//   goto callMemmove // Fall back on calling runtime·memmove.
+	// }
+	//
+	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
+	// against 21 instead of 16, because it cannot assume that all of its input
+	// is contiguous in memory and so it needs to leave enough source bytes to
+	// read the next tag without refilling buffers, but Go's Decode assumes
+	// contiguousness (the src argument is a []byte).
+	CMP $16, R4
+	BGT callMemmove
+	CMP $16, R2
+	BLT callMemmove
+	CMP $16, R3
+	BLT callMemmove
+
+	// !!! Implement the copy from src to dst as a 16-byte load and store.
+	// (Decode's documentation says that dst and src must not overlap.)
+	//
+	// This always copies 16 bytes, instead of only length bytes, but that's
+	// OK. If the input is a valid Snappy encoding then subsequent iterations
+	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
+	// non-nil error), so the overrun will be ignored.
+	//
+	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
+	// 16-byte loads and stores. This technique probably wouldn't be as
+	// effective on architectures that are fussier about alignment.
+	LDP 0(R6), (R14, R15)
+	STP (R14, R15), 0(R7)
+
+	// d += length
+	// s += length
+	ADD R4, R7, R7
+	ADD R4, R6, R6
+	B   loop
+
+callMemmove:
+	// if length > len(dst)-d || length > len(src)-s { etc }
+	CMP R2, R4
+	BGT errCorrupt
+	CMP R3, R4
+	BGT errCorrupt
+
+	// copy(dst[d:], src[s:s+length])
+	//
+	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
+	// R7, R6 and R4 as arguments. Coincidentally, we also need to spill those
+	// three registers to the stack, to save local variables across the CALL.
+	MOVD R7, 8(RSP)
+	MOVD R6, 16(RSP)
+	MOVD R4, 24(RSP)
+	MOVD R7, 32(RSP)
+	MOVD R6, 40(RSP)
+	MOVD R4, 48(RSP)
+	CALL runtime·memmove(SB)
+
+	// Restore local variables: unspill registers from the stack and
+	// re-calculate R8-R13.
+	MOVD 32(RSP), R7
+	MOVD 40(RSP), R6
+	MOVD 48(RSP), R4
+	MOVD dst_base+0(FP), R8
+	MOVD dst_len+8(FP), R9
+	MOVD R8, R10
+	ADD  R9, R10, R10
+	MOVD src_base+24(FP), R11
+	MOVD src_len+32(FP), R12
+	MOVD R11, R13
+	ADD  R12, R13, R13
+
+	// d += length
+	// s += length
+	ADD R4, R7, R7
+	ADD R4, R6, R6
+	B   loop
+
+tagLit60Plus:
+	// !!! This fragment does the
+	//
+	// s += x - 58; if uint(s) > uint(len(src)) { etc }
+	//
+	// checks. In the asm version, we code it once instead of once per switch case.
+	ADD  R4, R6, R6
+	SUB  $58, R6, R6
+	MOVD R6, R3
+	SUB  R11, R3, R3
+	CMP  R12, R3
+	BGT  errCorrupt
+
+	// case x == 60:
+	MOVW $61, R1
+	CMPW R1, R4
+	BEQ  tagLit61
+	BGT  tagLit62Plus
+
+	// x = uint32(src[s-1])
+	MOVBU -1(R6), R4
+	B     doLit
+
+tagLit61:
+	// case x == 61:
+	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
+	MOVHU -2(R6), R4
+	B     doLit
+
+tagLit62Plus:
+	CMPW $62, R4
+	BHI  tagLit63
+
+	// case x == 62:
+	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+	MOVHU -3(R6), R4
+	MOVBU -1(R6), R3
+	ORR   R3<<16, R4
+	B     doLit
+
+tagLit63:
+	// case x == 63:
+	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+	MOVWU -4(R6), R4
+	B     doLit
+
+	// The code above handles literal tags.
+	// ----------------------------------------
+	// The code below handles copy tags.
+
+tagCopy4:
+	// case tagCopy4:
+	// s += 5
+	ADD $5, R6, R6
+
+	// if uint(s) > uint(len(src)) { etc }
+	MOVD R6, R3
+	SUB  R11, R3, R3
+	CMP  R12, R3
+	BGT  errCorrupt
+
+	// length = 1 + int(src[s-5])>>2
+	MOVD $1, R1
+	ADD  R4>>2, R1, R4
+
+	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
+	MOVWU -4(R6), R5
+	B     doCopy
+
+tagCopy2:
+	// case tagCopy2:
+	// s += 3
+	ADD $3, R6, R6
+
+	// if uint(s) > uint(len(src)) { etc }
+	MOVD R6, R3
+	SUB  R11, R3, R3
+	CMP  R12, R3
+	BGT  errCorrupt
+
+	// length = 1 + int(src[s-3])>>2
+	MOVD $1, R1
+	ADD  R4>>2, R1, R4
+
+	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
+	MOVHU -2(R6), R5
+	B     doCopy
+
+tagCopy:
+	// We have a copy tag. We assume that:
+	//	- R3 == src[s] & 0x03
+	//	- R4 == src[s]
+	CMP $2, R3
+	BEQ tagCopy2
+	BGT tagCopy4
+
+	// case tagCopy1:
+	// s += 2
+	ADD $2, R6, R6
+
+	// if uint(s) > uint(len(src)) { etc }
+	MOVD R6, R3
+	SUB  R11, R3, R3
+	CMP  R12, R3
+	BGT  errCorrupt
+
+	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+	MOVD  R4, R5
+	AND   $0xe0, R5
+	MOVBU -1(R6), R3
+	ORR   R5<<3, R3, R5
+
+	// length = 4 + int(src[s-2])>>2&0x7
+	MOVD $7, R1
+	AND  R4>>2, R1, R4
+	ADD  $4, R4, R4
+
+doCopy:
+	// This is the end of the outer "switch", when we have a copy tag.
+	//
+	// We assume that:
+	//	- R4 == length && R4 > 0
+	//	- R5 == offset
+
+	// if offset <= 0 { etc }
+	MOVD $0, R1
+	CMP  R1, R5
+	BLE  errCorrupt
+
+	// if d < offset { etc }
+	MOVD R7, R3
+	SUB  R8, R3, R3
+	CMP  R5, R3
+	BLT  errCorrupt
+
+	// if length > len(dst)-d { etc }
+	MOVD R10, R3
+	SUB  R7, R3, R3
+	CMP  R3, R4
+	BGT  errCorrupt
+
+	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
+	//
+	// Set:
+	//	- R14 = len(dst)-d
+	//	- R15 = &dst[d-offset]
+	MOVD R10, R14
+	SUB  R7, R14, R14
+	MOVD R7, R15
+	SUB  R5, R15, R15
+
+	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
+	//
+	// First, try using two 8-byte load/stores, similar to the doLit technique
+	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
+	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
+	// and not one 16-byte load/store, and the first store has to be before the
+	// second load, due to the overlap if offset is in the range [8, 16).
+	//
+	// if length > 16 || offset < 8 || len(dst)-d < 16 {
+	//   goto slowForwardCopy
+	// }
+	// copy 16 bytes
+	// d += length
+	CMP  $16, R4
+	BGT  slowForwardCopy
+	CMP  $8, R5
+	BLT  slowForwardCopy
+	CMP  $16, R14
+	BLT  slowForwardCopy
+	MOVD 0(R15), R2
+	MOVD R2, 0(R7)
+	MOVD 8(R15), R3
+	MOVD R3, 8(R7)
+	ADD  R4, R7, R7
+	B    loop
+
+slowForwardCopy:
+	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
+	// can still try 8-byte load stores, provided we can overrun up to 10 extra
+	// bytes. As above, the overrun will be fixed up by subsequent iterations
+	// of the outermost loop.
+	//
+	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
+	// commentary says:
+	//
+	// ----
+	//
+	// The main part of this loop is a simple copy of eight bytes at a time
+	// until we've copied (at least) the requested amount of bytes.  However,
+	// if d and d-offset are less than eight bytes apart (indicating a
+	// repeating pattern of length < 8), we first need to expand the pattern in
+	// order to get the correct results. For instance, if the buffer looks like
+	// this, with the eight-byte <d-offset> and <d> patterns marked as
+	// intervals:
+	//
+	//    abxxxxxxxxxxxx
+	//    [------]           d-offset
+	//      [------]         d
+	//
+	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
+	// once, after which we can move <d> two bytes without moving <d-offset>:
+	//
+	//    ababxxxxxxxxxx
+	//    [------]           d-offset
+	//        [------]       d
+	//
+	// and repeat the exercise until the two no longer overlap.
+	//
+	// This allows us to do very well in the special case of one single byte
+	// repeated many times, without taking a big hit for more general cases.
+	//
+	// The worst case of extra writing past the end of the match occurs when
+	// offset == 1 and length == 1; the last copy will read from byte positions
+	// [0..7] and write to [4..11], whereas it was only supposed to write to
+	// position 1. Thus, ten excess bytes.
+	//
+	// ----
+	//
+	// That "10 byte overrun" worst case is confirmed by Go's
+	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
+	// and finishSlowForwardCopy algorithm.
+	//
+	// if length > len(dst)-d-10 {
+	//   goto verySlowForwardCopy
+	// }
+	SUB $10, R14, R14
+	CMP R14, R4
+	BGT verySlowForwardCopy
+
+makeOffsetAtLeast8:
+	// !!! As above, expand the pattern so that offset >= 8 and we can use
+	// 8-byte load/stores.
+	//
+	// for offset < 8 {
+	//   copy 8 bytes from dst[d-offset:] to dst[d:]
+	//   length -= offset
+	//   d      += offset
+	//   offset += offset
+	//   // The two previous lines together means that d-offset, and therefore
+	//   // R15, is unchanged.
+	// }
+	CMP  $8, R5
+	BGE  fixUpSlowForwardCopy
+	MOVD (R15), R3
+	MOVD R3, (R7)
+	SUB  R5, R4, R4
+	ADD  R5, R7, R7
+	ADD  R5, R5, R5
+	B    makeOffsetAtLeast8
+
+fixUpSlowForwardCopy:
+	// !!! Add length (which might be negative now) to d (implied by R7 being
+	// &dst[d]) so that d ends up at the right place when we jump back to the
+	// top of the loop. Before we do that, though, we save R7 to R2 so that, if
+	// length is positive, copying the remaining length bytes will write to the
+	// right place.
+	MOVD R7, R2
+	ADD  R4, R7, R7
+
+finishSlowForwardCopy:
+	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
+	// length means that we overrun, but as above, that will be fixed up by
+	// subsequent iterations of the outermost loop.
+	MOVD $0, R1
+	CMP  R1, R4
+	BLE  loop
+	MOVD (R15), R3
+	MOVD R3, (R2)
+	ADD  $8, R15, R15
+	ADD  $8, R2, R2
+	SUB  $8, R4, R4
+	B    finishSlowForwardCopy
+
+verySlowForwardCopy:
+	// verySlowForwardCopy is a simple implementation of forward copy. In C
+	// parlance, this is a do/while loop instead of a while loop, since we know
+	// that length > 0. In Go syntax:
+	//
+	// for {
+	//   dst[d] = dst[d - offset]
+	//   d++
+	//   length--
+	//   if length == 0 {
+	//     break
+	//   }
+	// }
+	MOVB (R15), R3
+	MOVB R3, (R7)
+	ADD  $1, R15, R15
+	ADD  $1, R7, R7
+	SUB  $1, R4, R4
+	CBNZ R4, verySlowForwardCopy
+	B    loop
+
+	// The code above handles copy tags.
+	// ----------------------------------------
+
+end:
+	// This is the end of the "for s < len(src)".
+	//
+	// if d != len(dst) { etc }
+	CMP R10, R7
+	BNE errCorrupt
+
+	// return 0
+	MOVD $0, ret+48(FP)
+	RET
+
+errCorrupt:
+	// return decodeErrCodeCorrupt
+	MOVD $1, R2
+	MOVD R2, ret+48(FP)
+	RET
diff --git a/vendor/github.com/klauspost/compress/snappy/decode_amd64.go b/vendor/github.com/golang/snappy/decode_asm.go
similarity index 93%
rename from vendor/github.com/klauspost/compress/snappy/decode_amd64.go
rename to vendor/github.com/golang/snappy/decode_asm.go
index fcd192b84..7082b3491 100644
--- a/vendor/github.com/klauspost/compress/snappy/decode_amd64.go
+++ b/vendor/github.com/golang/snappy/decode_asm.go
@@ -5,6 +5,7 @@
 // +build !appengine
 // +build gc
 // +build !noasm
+// +build amd64 arm64
 
 package snappy
 
diff --git a/vendor/github.com/klauspost/compress/snappy/decode_other.go b/vendor/github.com/golang/snappy/decode_other.go
similarity index 98%
rename from vendor/github.com/klauspost/compress/snappy/decode_other.go
rename to vendor/github.com/golang/snappy/decode_other.go
index 94a96c5d7..2f672be55 100644
--- a/vendor/github.com/klauspost/compress/snappy/decode_other.go
+++ b/vendor/github.com/golang/snappy/decode_other.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !amd64 appengine !gc noasm
+// +build !amd64,!arm64 appengine !gc noasm
 
 package snappy
 
@@ -87,7 +87,7 @@ func decode(dst, src []byte) int {
 		}
 		// Copy from an earlier sub-slice of dst to a later sub-slice.
 		// If no overlap, use the built-in copy:
-		if offset > length {
+		if offset >= length {
 			copy(dst[d:d+length], dst[d-offset:])
 			d += length
 			continue
diff --git a/vendor/github.com/klauspost/compress/snappy/encode.go b/vendor/github.com/golang/snappy/encode.go
similarity index 98%
rename from vendor/github.com/klauspost/compress/snappy/encode.go
rename to vendor/github.com/golang/snappy/encode.go
index 8d393e904..7f2365707 100644
--- a/vendor/github.com/klauspost/compress/snappy/encode.go
+++ b/vendor/github.com/golang/snappy/encode.go
@@ -15,6 +15,8 @@ import (
 // Otherwise, a newly allocated slice will be returned.
 //
 // The dst and src must not overlap. It is valid to pass a nil dst.
+//
+// Encode handles the Snappy block format, not the Snappy stream format.
 func Encode(dst, src []byte) []byte {
 	if n := MaxEncodedLen(len(src)); n < 0 {
 		panic(ErrTooLarge)
@@ -139,6 +141,8 @@ func NewBufferedWriter(w io.Writer) *Writer {
 }
 
 // Writer is an io.Writer that can write Snappy-compressed bytes.
+//
+// Writer handles the Snappy stream format, not the Snappy block format.
 type Writer struct {
 	w   io.Writer
 	err error
diff --git a/vendor/github.com/klauspost/compress/snappy/encode_amd64.s b/vendor/github.com/golang/snappy/encode_amd64.s
similarity index 100%
rename from vendor/github.com/klauspost/compress/snappy/encode_amd64.s
rename to vendor/github.com/golang/snappy/encode_amd64.s
diff --git a/vendor/github.com/golang/snappy/encode_arm64.s b/vendor/github.com/golang/snappy/encode_arm64.s
new file mode 100644
index 000000000..bf83667d7
--- /dev/null
+++ b/vendor/github.com/golang/snappy/encode_arm64.s
@@ -0,0 +1,722 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// The asm code generally follows the pure Go code in encode_other.go, except
+// where marked with a "!!!".
+
+// ----------------------------------------------------------------------------
+
+// func emitLiteral(dst, lit []byte) int
+//
+// All local variables fit into registers. The register allocation:
+//	- R3	len(lit)
+//	- R4	n
+//	- R6	return value
+//	- R8	&dst[i]
+//	- R10	&lit[0]
+//
+// The 32 bytes of stack space is to call runtime·memmove.
+//
+// The unusual register allocation of local variables, such as R10 for the
+// source pointer, matches the allocation used at the call site in encodeBlock,
+// which makes it easier to manually inline this function.
+TEXT ·emitLiteral(SB), NOSPLIT, $32-56
+	MOVD dst_base+0(FP), R8
+	MOVD lit_base+24(FP), R10
+	MOVD lit_len+32(FP), R3
+	MOVD R3, R6
+	MOVW R3, R4
+	SUBW $1, R4, R4
+
+	CMPW $60, R4
+	BLT  oneByte
+	CMPW $256, R4
+	BLT  twoBytes
+
+threeBytes:
+	MOVD $0xf4, R2
+	MOVB R2, 0(R8)
+	MOVW R4, 1(R8)
+	ADD  $3, R8, R8
+	ADD  $3, R6, R6
+	B    memmove
+
+twoBytes:
+	MOVD $0xf0, R2
+	MOVB R2, 0(R8)
+	MOVB R4, 1(R8)
+	ADD  $2, R8, R8
+	ADD  $2, R6, R6
+	B    memmove
+
+oneByte:
+	LSLW $2, R4, R4
+	MOVB R4, 0(R8)
+	ADD  $1, R8, R8
+	ADD  $1, R6, R6
+
+memmove:
+	MOVD R6, ret+48(FP)
+
+	// copy(dst[i:], lit)
+	//
+	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+	// R8, R10 and R3 as arguments.
+	MOVD R8, 8(RSP)
+	MOVD R10, 16(RSP)
+	MOVD R3, 24(RSP)
+	CALL runtime·memmove(SB)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func emitCopy(dst []byte, offset, length int) int
+//
+// All local variables fit into registers. The register allocation:
+//	- R3	length
+//	- R7	&dst[0]
+//	- R8	&dst[i]
+//	- R11	offset
+//
+// The unusual register allocation of local variables, such as R11 for the
+// offset, matches the allocation used at the call site in encodeBlock, which
+// makes it easier to manually inline this function.
+TEXT ·emitCopy(SB), NOSPLIT, $0-48
+	MOVD dst_base+0(FP), R8
+	MOVD R8, R7
+	MOVD offset+24(FP), R11
+	MOVD length+32(FP), R3
+
+loop0:
+	// for length >= 68 { etc }
+	CMPW $68, R3
+	BLT  step1
+
+	// Emit a length 64 copy, encoded as 3 bytes.
+	MOVD $0xfe, R2
+	MOVB R2, 0(R8)
+	MOVW R11, 1(R8)
+	ADD  $3, R8, R8
+	SUB  $64, R3, R3
+	B    loop0
+
+step1:
+	// if length > 64 { etc }
+	CMP $64, R3
+	BLE step2
+
+	// Emit a length 60 copy, encoded as 3 bytes.
+	MOVD $0xee, R2
+	MOVB R2, 0(R8)
+	MOVW R11, 1(R8)
+	ADD  $3, R8, R8
+	SUB  $60, R3, R3
+
+step2:
+	// if length >= 12 || offset >= 2048 { goto step3 }
+	CMP  $12, R3
+	BGE  step3
+	CMPW $2048, R11
+	BGE  step3
+
+	// Emit the remaining copy, encoded as 2 bytes.
+	MOVB R11, 1(R8)
+	LSRW $3, R11, R11
+	AND  $0xe0, R11, R11
+	SUB  $4, R3, R3
+	LSLW $2, R3
+	AND  $0xff, R3, R3
+	ORRW R3, R11, R11
+	ORRW $1, R11, R11
+	MOVB R11, 0(R8)
+	ADD  $2, R8, R8
+
+	// Return the number of bytes written.
+	SUB  R7, R8, R8
+	MOVD R8, ret+40(FP)
+	RET
+
+step3:
+	// Emit the remaining copy, encoded as 3 bytes.
+	SUB  $1, R3, R3
+	AND  $0xff, R3, R3
+	LSLW $2, R3, R3
+	ORRW $2, R3, R3
+	MOVB R3, 0(R8)
+	MOVW R11, 1(R8)
+	ADD  $3, R8, R8
+
+	// Return the number of bytes written.
+	SUB  R7, R8, R8
+	MOVD R8, ret+40(FP)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func extendMatch(src []byte, i, j int) int
+//
+// All local variables fit into registers. The register allocation:
+//	- R6	&src[0]
+//	- R7	&src[j]
+//	- R13	&src[len(src) - 8]
+//	- R14	&src[len(src)]
+//	- R15	&src[i]
+//
+// The unusual register allocation of local variables, such as R15 for a source
+// pointer, matches the allocation used at the call site in encodeBlock, which
+// makes it easier to manually inline this function.
+TEXT ·extendMatch(SB), NOSPLIT, $0-48
+	MOVD src_base+0(FP), R6
+	MOVD src_len+8(FP), R14
+	MOVD i+24(FP), R15
+	MOVD j+32(FP), R7
+	ADD  R6, R14, R14
+	ADD  R6, R15, R15
+	ADD  R6, R7, R7
+	MOVD R14, R13
+	SUB  $8, R13, R13
+
+cmp8:
+	// As long as we are 8 or more bytes before the end of src, we can load and
+	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+	CMP  R13, R7
+	BHI  cmp1
+	MOVD (R15), R3
+	MOVD (R7), R4
+	CMP  R4, R3
+	BNE  bsf
+	ADD  $8, R15, R15
+	ADD  $8, R7, R7
+	B    cmp8
+
+bsf:
+	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+	// the index of the first byte that differs.
+	// RBIT reverses the bit order, then CLZ counts the leading zeros, the
+	// combination of which finds the least significant bit which is set.
+	// The arm64 architecture is little-endian, and the shift by 3 converts
+	// a bit index to a byte index.
+	EOR  R3, R4, R4
+	RBIT R4, R4
+	CLZ  R4, R4
+	ADD  R4>>3, R7, R7
+
+	// Convert from &src[ret] to ret.
+	SUB  R6, R7, R7
+	MOVD R7, ret+40(FP)
+	RET
+
+cmp1:
+	// In src's tail, compare 1 byte at a time.
+	CMP  R7, R14
+	BLS  extendMatchEnd
+	MOVB (R15), R3
+	MOVB (R7), R4
+	CMP  R4, R3
+	BNE  extendMatchEnd
+	ADD  $1, R15, R15
+	ADD  $1, R7, R7
+	B    cmp1
+
+extendMatchEnd:
+	// Convert from &src[ret] to ret.
+	SUB  R6, R7, R7
+	MOVD R7, ret+40(FP)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func encodeBlock(dst, src []byte) (d int)
+//
+// All local variables fit into registers, other than "var table". The register
+// allocation:
+//	- R3	.	.
+//	- R4	.	.
+//	- R5	64	shift
+//	- R6	72	&src[0], tableSize
+//	- R7	80	&src[s]
+//	- R8	88	&dst[d]
+//	- R9	96	sLimit
+//	- R10	.	&src[nextEmit]
+//	- R11	104	prevHash, currHash, nextHash, offset
+//	- R12	112	&src[base], skip
+//	- R13	.	&src[nextS], &src[len(src) - 8]
+//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
+//	- R15	120	candidate
+//	- R16	.	hash constant, 0x1e35a7bd
+//	- R17	.	&table
+//	- .  	128	table
+//
+// The second column (64, 72, etc) is the stack offset to spill the registers
+// when calling other functions. We could pack this slightly tighter, but it's
+// simpler to have a dedicated spill map independent of the function called.
+//
+// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
+// extra 64 bytes, to call other functions, and an extra 64 bytes, to spill
+// local variables (registers) during calls gives 32768 + 64 + 64 = 32896.
+TEXT ·encodeBlock(SB), 0, $32896-56
+	MOVD dst_base+0(FP), R8
+	MOVD src_base+24(FP), R7
+	MOVD src_len+32(FP), R14
+
+	// shift, tableSize := uint32(32-8), 1<<8
+	MOVD  $24, R5
+	MOVD  $256, R6
+	MOVW  $0xa7bd, R16
+	MOVKW $(0x1e35<<16), R16
+
+calcShift:
+	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
+	//	shift--
+	// }
+	MOVD $16384, R2
+	CMP  R2, R6
+	BGE  varTable
+	CMP  R14, R6
+	BGE  varTable
+	SUB  $1, R5, R5
+	LSL  $1, R6, R6
+	B    calcShift
+
+varTable:
+	// var table [maxTableSize]uint16
+	//
+	// In the asm code, unlike the Go code, we can zero-initialize only the
+	// first tableSize elements. Each uint16 element is 2 bytes and each
+	// iterations writes 64 bytes, so we can do only tableSize/32 writes
+	// instead of the 2048 writes that would zero-initialize all of table's
+	// 32768 bytes. This clear could overrun the first tableSize elements, but
+	// it won't overrun the allocated stack size.
+	ADD  $128, RSP, R17
+	MOVD R17, R4
+
+	// !!! R6 = &src[tableSize]
+	ADD R6<<1, R17, R6
+
+memclr:
+	STP.P (ZR, ZR), 64(R4)
+	STP   (ZR, ZR), -48(R4)
+	STP   (ZR, ZR), -32(R4)
+	STP   (ZR, ZR), -16(R4)
+	CMP   R4, R6
+	BHI   memclr
+
+	// !!! R6 = &src[0]
+	MOVD R7, R6
+
+	// sLimit := len(src) - inputMargin
+	MOVD R14, R9
+	SUB  $15, R9, R9
+
+	// !!! Pre-emptively spill R5, R6 and R9 to the stack. Their values don't
+	// change for the rest of the function.
+	MOVD R5, 64(RSP)
+	MOVD R6, 72(RSP)
+	MOVD R9, 96(RSP)
+
+	// nextEmit := 0
+	MOVD R6, R10
+
+	// s := 1
+	ADD $1, R7, R7
+
+	// nextHash := hash(load32(src, s), shift)
+	MOVW 0(R7), R11
+	MULW R16, R11, R11
+	LSRW R5, R11, R11
+
+outer:
+	// for { etc }
+
+	// skip := 32
+	MOVD $32, R12
+
+	// nextS := s
+	MOVD R7, R13
+
+	// candidate := 0
+	MOVD $0, R15
+
+inner0:
+	// for { etc }
+
+	// s := nextS
+	MOVD R13, R7
+
+	// bytesBetweenHashLookups := skip >> 5
+	MOVD R12, R14
+	LSR  $5, R14, R14
+
+	// nextS = s + bytesBetweenHashLookups
+	ADD R14, R13, R13
+
+	// skip += bytesBetweenHashLookups
+	ADD R14, R12, R12
+
+	// if nextS > sLimit { goto emitRemainder }
+	MOVD R13, R3
+	SUB  R6, R3, R3
+	CMP  R9, R3
+	BHI  emitRemainder
+
+	// candidate = int(table[nextHash])
+	MOVHU 0(R17)(R11<<1), R15
+
+	// table[nextHash] = uint16(s)
+	MOVD R7, R3
+	SUB  R6, R3, R3
+
+	MOVH R3, 0(R17)(R11<<1)
+
+	// nextHash = hash(load32(src, nextS), shift)
+	MOVW 0(R13), R11
+	MULW R16, R11
+	LSRW R5, R11, R11
+
+	// if load32(src, s) != load32(src, candidate) { continue } break
+	MOVW 0(R7), R3
+	MOVW (R6)(R15*1), R4
+	CMPW R4, R3
+	BNE  inner0
+
+fourByteMatch:
+	// As per the encode_other.go code:
+	//
+	// A 4-byte match has been found. We'll later see etc.
+
+	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
+	// on inputMargin in encode.go.
+	MOVD R7, R3
+	SUB  R10, R3, R3
+	CMP  $16, R3
+	BLE  emitLiteralFastPath
+
+	// ----------------------------------------
+	// Begin inline of the emitLiteral call.
+	//
+	// d += emitLiteral(dst[d:], src[nextEmit:s])
+
+	MOVW R3, R4
+	SUBW $1, R4, R4
+
+	MOVW $60, R2
+	CMPW R2, R4
+	BLT  inlineEmitLiteralOneByte
+	MOVW $256, R2
+	CMPW R2, R4
+	BLT  inlineEmitLiteralTwoBytes
+
+inlineEmitLiteralThreeBytes:
+	MOVD $0xf4, R1
+	MOVB R1, 0(R8)
+	MOVW R4, 1(R8)
+	ADD  $3, R8, R8
+	B    inlineEmitLiteralMemmove
+
+inlineEmitLiteralTwoBytes:
+	MOVD $0xf0, R1
+	MOVB R1, 0(R8)
+	MOVB R4, 1(R8)
+	ADD  $2, R8, R8
+	B    inlineEmitLiteralMemmove
+
+inlineEmitLiteralOneByte:
+	LSLW $2, R4, R4
+	MOVB R4, 0(R8)
+	ADD  $1, R8, R8
+
+inlineEmitLiteralMemmove:
+	// Spill local variables (registers) onto the stack; call; unspill.
+	//
+	// copy(dst[i:], lit)
+	//
+	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+	// R8, R10 and R3 as arguments.
+	MOVD R8, 8(RSP)
+	MOVD R10, 16(RSP)
+	MOVD R3, 24(RSP)
+
+	// Finish the "d +=" part of "d += emitLiteral(etc)".
+	ADD   R3, R8, R8
+	MOVD  R7, 80(RSP)
+	MOVD  R8, 88(RSP)
+	MOVD  R15, 120(RSP)
+	CALL  runtime·memmove(SB)
+	MOVD  64(RSP), R5
+	MOVD  72(RSP), R6
+	MOVD  80(RSP), R7
+	MOVD  88(RSP), R8
+	MOVD  96(RSP), R9
+	MOVD  120(RSP), R15
+	ADD   $128, RSP, R17
+	MOVW  $0xa7bd, R16
+	MOVKW $(0x1e35<<16), R16
+	B     inner1
+
+inlineEmitLiteralEnd:
+	// End inline of the emitLiteral call.
+	// ----------------------------------------
+
+emitLiteralFastPath:
+	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
+	MOVB R3, R4
+	SUBW $1, R4, R4
+	AND  $0xff, R4, R4
+	LSLW $2, R4, R4
+	MOVB R4, (R8)
+	ADD  $1, R8, R8
+
+	// !!! Implement the copy from lit to dst as a 16-byte load and store.
+	// (Encode's documentation says that dst and src must not overlap.)
+	//
+	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
+	// OK. Subsequent iterations will fix up the overrun.
+	//
+	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
+	// 16-byte loads and stores. This technique probably wouldn't be as
+	// effective on architectures that are fussier about alignment.
+	LDP 0(R10), (R0, R1)
+	STP (R0, R1), 0(R8)
+	ADD R3, R8, R8
+
+inner1:
+	// for { etc }
+
+	// base := s
+	MOVD R7, R12
+
+	// !!! offset := base - candidate
+	MOVD R12, R11
+	SUB  R15, R11, R11
+	SUB  R6, R11, R11
+
+	// ----------------------------------------
+	// Begin inline of the extendMatch call.
+	//
+	// s = extendMatch(src, candidate+4, s+4)
+
+	// !!! R14 = &src[len(src)]
+	MOVD src_len+32(FP), R14
+	ADD  R6, R14, R14
+
+	// !!! R13 = &src[len(src) - 8]
+	MOVD R14, R13
+	SUB  $8, R13, R13
+
+	// !!! R15 = &src[candidate + 4]
+	ADD $4, R15, R15
+	ADD R6, R15, R15
+
+	// !!! s += 4
+	ADD $4, R7, R7
+
+inlineExtendMatchCmp8:
+	// As long as we are 8 or more bytes before the end of src, we can load and
+	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+	CMP  R13, R7
+	BHI  inlineExtendMatchCmp1
+	MOVD (R15), R3
+	MOVD (R7), R4
+	CMP  R4, R3
+	BNE  inlineExtendMatchBSF
+	ADD  $8, R15, R15
+	ADD  $8, R7, R7
+	B    inlineExtendMatchCmp8
+
+inlineExtendMatchBSF:
+	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+	// the index of the first byte that differs.
+	// RBIT reverses the bit order, then CLZ counts the leading zeros, the
+	// combination of which finds the least significant bit which is set.
+	// The arm64 architecture is little-endian, and the shift by 3 converts
+	// a bit index to a byte index.
+	EOR  R3, R4, R4
+	RBIT R4, R4
+	CLZ  R4, R4
+	ADD  R4>>3, R7, R7
+	B    inlineExtendMatchEnd
+
+inlineExtendMatchCmp1:
+	// In src's tail, compare 1 byte at a time.
+	CMP  R7, R14
+	BLS  inlineExtendMatchEnd
+	MOVB (R15), R3
+	MOVB (R7), R4
+	CMP  R4, R3
+	BNE  inlineExtendMatchEnd
+	ADD  $1, R15, R15
+	ADD  $1, R7, R7
+	B    inlineExtendMatchCmp1
+
+inlineExtendMatchEnd:
+	// End inline of the extendMatch call.
+	// ----------------------------------------
+
+	// ----------------------------------------
+	// Begin inline of the emitCopy call.
+	//
+	// d += emitCopy(dst[d:], base-candidate, s-base)
+
+	// !!! length := s - base
+	MOVD R7, R3
+	SUB  R12, R3, R3
+
+inlineEmitCopyLoop0:
+	// for length >= 68 { etc }
+	MOVW $68, R2
+	CMPW R2, R3
+	BLT  inlineEmitCopyStep1
+
+	// Emit a length 64 copy, encoded as 3 bytes.
+	MOVD $0xfe, R1
+	MOVB R1, 0(R8)
+	MOVW R11, 1(R8)
+	ADD  $3, R8, R8
+	SUBW $64, R3, R3
+	B    inlineEmitCopyLoop0
+
+inlineEmitCopyStep1:
+	// if length > 64 { etc }
+	MOVW $64, R2
+	CMPW R2, R3
+	BLE  inlineEmitCopyStep2
+
+	// Emit a length 60 copy, encoded as 3 bytes.
+	MOVD $0xee, R1
+	MOVB R1, 0(R8)
+	MOVW R11, 1(R8)
+	ADD  $3, R8, R8
+	SUBW $60, R3, R3
+
+inlineEmitCopyStep2:
+	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
+	MOVW $12, R2
+	CMPW R2, R3
+	BGE  inlineEmitCopyStep3
+	MOVW $2048, R2
+	CMPW R2, R11
+	BGE  inlineEmitCopyStep3
+
+	// Emit the remaining copy, encoded as 2 bytes.
+	MOVB R11, 1(R8)
+	LSRW $8, R11, R11
+	LSLW $5, R11, R11
+	SUBW $4, R3, R3
+	AND  $0xff, R3, R3
+	LSLW $2, R3, R3
+	ORRW R3, R11, R11
+	ORRW $1, R11, R11
+	MOVB R11, 0(R8)
+	ADD  $2, R8, R8
+	B    inlineEmitCopyEnd
+
+inlineEmitCopyStep3:
+	// Emit the remaining copy, encoded as 3 bytes.
+	SUBW $1, R3, R3
+	LSLW $2, R3, R3
+	ORRW $2, R3, R3
+	MOVB R3, 0(R8)
+	MOVW R11, 1(R8)
+	ADD  $3, R8, R8
+
+inlineEmitCopyEnd:
+	// End inline of the emitCopy call.
+	// ----------------------------------------
+
+	// nextEmit = s
+	MOVD R7, R10
+
+	// if s >= sLimit { goto emitRemainder }
+	MOVD R7, R3
+	SUB  R6, R3, R3
+	CMP  R3, R9
+	BLS  emitRemainder
+
+	// As per the encode_other.go code:
+	//
+	// We could immediately etc.
+
+	// x := load64(src, s-1)
+	MOVD -1(R7), R14
+
+	// prevHash := hash(uint32(x>>0), shift)
+	MOVW R14, R11
+	MULW R16, R11, R11
+	LSRW R5, R11, R11
+
+	// table[prevHash] = uint16(s-1)
+	MOVD R7, R3
+	SUB  R6, R3, R3
+	SUB  $1, R3, R3
+
+	MOVHU R3, 0(R17)(R11<<1)
+
+	// currHash := hash(uint32(x>>8), shift)
+	LSR  $8, R14, R14
+	MOVW R14, R11
+	MULW R16, R11, R11
+	LSRW R5, R11, R11
+
+	// candidate = int(table[currHash])
+	MOVHU 0(R17)(R11<<1), R15
+
+	// table[currHash] = uint16(s)
+	ADD   $1, R3, R3
+	MOVHU R3, 0(R17)(R11<<1)
+
+	// if uint32(x>>8) == load32(src, candidate) { continue }
+	MOVW (R6)(R15*1), R4
+	CMPW R4, R14
+	BEQ  inner1
+
+	// nextHash = hash(uint32(x>>16), shift)
+	LSR  $8, R14, R14
+	MOVW R14, R11
+	MULW R16, R11, R11
+	LSRW R5, R11, R11
+
+	// s++
+	ADD $1, R7, R7
+
+	// break out of the inner1 for loop, i.e. continue the outer loop.
+	B outer
+
+emitRemainder:
+	// if nextEmit < len(src) { etc }
+	MOVD src_len+32(FP), R3
+	ADD  R6, R3, R3
+	CMP  R3, R10
+	BEQ  encodeBlockEnd
+
+	// d += emitLiteral(dst[d:], src[nextEmit:])
+	//
+	// Push args.
+	MOVD R8, 8(RSP)
+	MOVD $0, 16(RSP)  // Unnecessary, as the callee ignores it, but conservative.
+	MOVD $0, 24(RSP)  // Unnecessary, as the callee ignores it, but conservative.
+	MOVD R10, 32(RSP)
+	SUB  R10, R3, R3
+	MOVD R3, 40(RSP)
+	MOVD R3, 48(RSP)  // Unnecessary, as the callee ignores it, but conservative.
+
+	// Spill local variables (registers) onto the stack; call; unspill.
+	MOVD R8, 88(RSP)
+	CALL ·emitLiteral(SB)
+	MOVD 88(RSP), R8
+
+	// Finish the "d +=" part of "d += emitLiteral(etc)".
+	MOVD 56(RSP), R1
+	ADD  R1, R8, R8
+
+encodeBlockEnd:
+	MOVD dst_base+0(FP), R3
+	SUB  R3, R8, R8
+	MOVD R8, d+48(FP)
+	RET
diff --git a/vendor/github.com/klauspost/compress/snappy/encode_amd64.go b/vendor/github.com/golang/snappy/encode_asm.go
similarity index 97%
rename from vendor/github.com/klauspost/compress/snappy/encode_amd64.go
rename to vendor/github.com/golang/snappy/encode_asm.go
index 150d91bc8..107c1e714 100644
--- a/vendor/github.com/klauspost/compress/snappy/encode_amd64.go
+++ b/vendor/github.com/golang/snappy/encode_asm.go
@@ -5,6 +5,7 @@
 // +build !appengine
 // +build gc
 // +build !noasm
+// +build amd64 arm64
 
 package snappy
 
diff --git a/vendor/github.com/klauspost/compress/snappy/encode_other.go b/vendor/github.com/golang/snappy/encode_other.go
similarity index 99%
rename from vendor/github.com/klauspost/compress/snappy/encode_other.go
rename to vendor/github.com/golang/snappy/encode_other.go
index dbcae905e..296d7f0be 100644
--- a/vendor/github.com/klauspost/compress/snappy/encode_other.go
+++ b/vendor/github.com/golang/snappy/encode_other.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !amd64 appengine !gc noasm
+// +build !amd64,!arm64 appengine !gc noasm
 
 package snappy
 
diff --git a/vendor/github.com/golang/snappy/go.mod b/vendor/github.com/golang/snappy/go.mod
new file mode 100644
index 000000000..f6406bb2c
--- /dev/null
+++ b/vendor/github.com/golang/snappy/go.mod
@@ -0,0 +1 @@
+module github.com/golang/snappy
diff --git a/vendor/github.com/klauspost/compress/snappy/snappy.go b/vendor/github.com/golang/snappy/snappy.go
similarity index 98%
rename from vendor/github.com/klauspost/compress/snappy/snappy.go
rename to vendor/github.com/golang/snappy/snappy.go
index 74a36689e..ece692ea4 100644
--- a/vendor/github.com/klauspost/compress/snappy/snappy.go
+++ b/vendor/github.com/golang/snappy/snappy.go
@@ -17,7 +17,7 @@
 //
 // The canonical, C++ implementation is at https://github.com/google/snappy and
 // it only implements the block format.
-package snappy
+package snappy // import "github.com/golang/snappy"
 
 import (
 	"hash/crc32"
diff --git a/vendor/github.com/honeycombio/libhoney-go/CHANGELOG.md b/vendor/github.com/honeycombio/libhoney-go/CHANGELOG.md
index dfc32c8c2..dae40a50f 100644
--- a/vendor/github.com/honeycombio/libhoney-go/CHANGELOG.md
+++ b/vendor/github.com/honeycombio/libhoney-go/CHANGELOG.md
@@ -1,5 +1,15 @@
 # libhoney Changelog
 
+## 1.15.3 2021-06-02
+
+### Improvements
+
+- Add more context to batch response parsing error (#116)
+
+### Maintenance
+
+- Add go 1.15 & 1.16 to the testing matrix (#114, #119)
+
 ## 1.15.2 2021-01-22
 
 NOTE: v1.15.1 may cause update warnings due to checksum error, please use v1.15.2 instead.
diff --git a/vendor/github.com/honeycombio/libhoney-go/go.mod b/vendor/github.com/honeycombio/libhoney-go/go.mod
index 352bbd8c2..6c54f3eeb 100644
--- a/vendor/github.com/honeycombio/libhoney-go/go.mod
+++ b/vendor/github.com/honeycombio/libhoney-go/go.mod
@@ -3,7 +3,7 @@ module github.com/honeycombio/libhoney-go
 go 1.14
 
 require (
-	github.com/DataDog/zstd v1.4.5
+	github.com/DataDog/zstd v1.4.8
 	github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a // indirect
 	github.com/facebookgo/ensure v0.0.0-20200202191622-63f1cf65ac4c // indirect
 	github.com/facebookgo/limitgroup v0.0.0-20150612190941-6abd8d71ec01 // indirect
@@ -11,7 +11,7 @@ require (
 	github.com/facebookgo/stack v0.0.0-20160209184415-751773369052 // indirect
 	github.com/facebookgo/subset v0.0.0-20200203212716-c811ad88dec4 // indirect
 	github.com/golang/protobuf v1.3.5 // indirect
-	github.com/klauspost/compress v1.11.4
+	github.com/klauspost/compress v1.12.2
 	github.com/stretchr/testify v1.6.1
 	github.com/vmihailenco/msgpack/v4 v4.3.12
 	golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e // indirect
diff --git a/vendor/github.com/honeycombio/libhoney-go/go.sum b/vendor/github.com/honeycombio/libhoney-go/go.sum
index fdd0d07dd..a32a732a1 100644
--- a/vendor/github.com/honeycombio/libhoney-go/go.sum
+++ b/vendor/github.com/honeycombio/libhoney-go/go.sum
@@ -1,6 +1,5 @@
-github.com/DataDog/zstd v1.4.5 h1:EndNeuB0l9syBZhut0wns3gV1hL8zX8LIu6ZiVHWLIQ=
-github.com/DataDog/zstd v1.4.5/go.mod h1:1jcaCB/ufaK+sKp1NBhlGmpz41jOoPQ35bpF36t7BBo=
-github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
+github.com/DataDog/zstd v1.4.8 h1:Rpmta4xZ/MgZnriKNd24iZMhGpP5dvUcs/uqfBapKZY=
+github.com/DataDog/zstd v1.4.8/go.mod h1:g4AWEaM3yOg3HYfnJ3YIawPnVdXJh9QME85blwSAmyw=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -16,14 +15,14 @@ github.com/facebookgo/stack v0.0.0-20160209184415-751773369052 h1:JWuenKqqX8nojt
 github.com/facebookgo/stack v0.0.0-20160209184415-751773369052/go.mod h1:UbMTZqLaRiH3MsBH8va0n7s1pQYcu3uTb8G4tygF4Zg=
 github.com/facebookgo/subset v0.0.0-20200203212716-c811ad88dec4 h1:7HZCaLC5+BZpmbhCOZJ293Lz68O7PYrF2EzeiFMwCLk=
 github.com/facebookgo/subset v0.0.0-20200203212716-c811ad88dec4/go.mod h1:5tD+neXqOorC30/tWg0LCSkrqj/AR6gu8yY8/fpw1q0=
-github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
 github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
-github.com/golang/protobuf v1.3.4 h1:87PNWwrRvUSnqS4dlcBU/ftvOIBep4sYuBLlh6rX2wk=
 github.com/golang/protobuf v1.3.4/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw=
 github.com/golang/protobuf v1.3.5 h1:F768QJ1E9tib+q5Sc8MkdJi1RxLTbRcTf8LJV56aRls=
 github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk=
-github.com/klauspost/compress v1.11.4 h1:kz40R/YWls3iqT9zX9AHN3WoVsrAWVyui5sxuLqiXqU=
-github.com/klauspost/compress v1.11.4/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
+github.com/golang/snappy v0.0.3 h1:fHPg5GQYlCeLIPB9BZqMVR5nR9A+IM5zcgeTdjMYmLA=
+github.com/golang/snappy v0.0.3/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
+github.com/klauspost/compress v1.12.2 h1:2KCfW3I9M7nSc5wOqXAlW2v2U6v+w6cbjvbfp+OykW8=
+github.com/klauspost/compress v1.12.2/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg=
 github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
 github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
 github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
@@ -31,7 +30,6 @@ github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
 github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/testify v1.6.1 h1:hDPOHmpOpP40lSULcqw7IrRb/u7w6RpDC9399XyoNd0=
 github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
@@ -40,9 +38,7 @@ github.com/vmihailenco/msgpack/v4 v4.3.12/go.mod h1:gborTTJjAo/GWTqqRjrLCn9pgNN+
 github.com/vmihailenco/tagparser v0.1.1 h1:quXMXlA39OCbd2wAdTsGDlK9RkOk6Wuw+x37wVyIuWY=
 github.com/vmihailenco/tagparser v0.1.1/go.mod h1:OeAg3pn3UbLjkWt+rN9oFYB6u/cQgqMEUPoW2WPyhdI=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
-golang.org/x/net v0.0.0-20190603091049-60506f45cf65 h1:+rhAzEzT3f4JtomfC371qB+0Ola2caSKcY69NUBZrRQ=
 golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
-golang.org/x/net v0.0.0-20200301022130-244492dfa37a h1:GuSPYbZzB5/dcLNCwLQLsg3obCJtX9IJhpXkvY7kzk0=
 golang.org/x/net v0.0.0-20200301022130-244492dfa37a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e h1:3G+cUijn7XD+S4eJFddp53Pv7+slrESplyjG25HgL+k=
 golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
@@ -55,7 +51,6 @@ google.golang.org/appengine v1.6.5 h1:tycE03LOZYQNhDpS27tcQdAzLCVMaj7QT2SXxebnpC
 google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
 gopkg.in/alexcesaro/statsd.v2 v2.0.0 h1:FXkZSCZIH17vLCO5sO2UucTHsH9pc+17F6pl3JVCwMc=
 gopkg.in/alexcesaro/statsd.v2 v2.0.0/go.mod h1:i0ubccKGzBVNBpdGV5MocxyA/XlLUJzA7SLonnE4drU=
-gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
 gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
diff --git a/vendor/github.com/honeycombio/libhoney-go/libhoney.go b/vendor/github.com/honeycombio/libhoney-go/libhoney.go
index 2a3ea9c29..d2043981e 100644
--- a/vendor/github.com/honeycombio/libhoney-go/libhoney.go
+++ b/vendor/github.com/honeycombio/libhoney-go/libhoney.go
@@ -33,7 +33,7 @@ const (
 	defaultSampleRate = 1
 	defaultAPIHost    = "https://api.honeycomb.io/"
 	defaultDataset    = "libhoney-go dataset"
-	version           = "1.15.2"
+	version           = "1.15.3"
 
 	// DefaultMaxBatchSize how many events to collect in a batch
 	DefaultMaxBatchSize = 50
diff --git a/vendor/github.com/honeycombio/libhoney-go/transmission/transmission.go b/vendor/github.com/honeycombio/libhoney-go/transmission/transmission.go
index d26f22ffd..1bdf21cbd 100644
--- a/vendor/github.com/honeycombio/libhoney-go/transmission/transmission.go
+++ b/vendor/github.com/honeycombio/libhoney-go/transmission/transmission.go
@@ -480,7 +480,11 @@ func (b *batchAgg) fireBatch(events []*Event) {
 	if err != nil {
 		// if we can't decode the responses, just error out all of them
 		b.metrics.Increment("response_decode_errors")
-		b.enqueueErrResponses(err, events, dur/time.Duration(numEncoded))
+		b.enqueueErrResponses(fmt.Errorf(
+			"got OK HTTP response, but couldn't read response body: %v", err),
+			events,
+			dur/time.Duration(numEncoded),
+		)
 		return
 	}
 
diff --git a/vendor/github.com/klauspost/compress/fse/compress.go b/vendor/github.com/klauspost/compress/fse/compress.go
index b69237c9b..6f341914c 100644
--- a/vendor/github.com/klauspost/compress/fse/compress.go
+++ b/vendor/github.com/klauspost/compress/fse/compress.go
@@ -92,7 +92,6 @@ func (c *cState) init(bw *bitWriter, ct *cTable, tableLog uint8, first symbolTra
 	im := int32((nbBitsOut << 16) - first.deltaNbBits)
 	lu := (im >> nbBitsOut) + first.deltaFindState
 	c.state = c.stateTable[lu]
-	return
 }
 
 // encode the output symbol provided and write it to the bitstream.
@@ -301,7 +300,7 @@ func (s *Scratch) writeCount() error {
 	out[outP+1] = byte(bitStream >> 8)
 	outP += (bitCount + 7) / 8
 
-	if uint16(charnum) > s.symbolLen {
+	if charnum > s.symbolLen {
 		return errors.New("internal error: charnum > s.symbolLen")
 	}
 	s.Out = out[:outP]
@@ -331,7 +330,7 @@ type cTable struct {
 func (s *Scratch) allocCtable() {
 	tableSize := 1 << s.actualTableLog
 	// get tableSymbol that is big enough.
-	if cap(s.ct.tableSymbol) < int(tableSize) {
+	if cap(s.ct.tableSymbol) < tableSize {
 		s.ct.tableSymbol = make([]byte, tableSize)
 	}
 	s.ct.tableSymbol = s.ct.tableSymbol[:tableSize]
@@ -565,8 +564,8 @@ func (s *Scratch) normalizeCount2() error {
 		distributed  uint32
 		total        = uint32(s.br.remain())
 		tableLog     = s.actualTableLog
-		lowThreshold = uint32(total >> tableLog)
-		lowOne       = uint32((total * 3) >> (tableLog + 1))
+		lowThreshold = total >> tableLog
+		lowOne       = (total * 3) >> (tableLog + 1)
 	)
 	for i, cnt := range s.count[:s.symbolLen] {
 		if cnt == 0 {
@@ -591,7 +590,7 @@ func (s *Scratch) normalizeCount2() error {
 
 	if (total / toDistribute) > lowOne {
 		// risk of rounding to zero
-		lowOne = uint32((total * 3) / (toDistribute * 2))
+		lowOne = (total * 3) / (toDistribute * 2)
 		for i, cnt := range s.count[:s.symbolLen] {
 			if (s.norm[i] == notYetAssigned) && (cnt <= lowOne) {
 				s.norm[i] = 1
diff --git a/vendor/github.com/klauspost/compress/fse/decompress.go b/vendor/github.com/klauspost/compress/fse/decompress.go
index 413ec3b3c..926f5f153 100644
--- a/vendor/github.com/klauspost/compress/fse/decompress.go
+++ b/vendor/github.com/klauspost/compress/fse/decompress.go
@@ -172,7 +172,7 @@ type decSymbol struct {
 // allocDtable will allocate decoding tables if they are not big enough.
 func (s *Scratch) allocDtable() {
 	tableSize := 1 << s.actualTableLog
-	if cap(s.decTable) < int(tableSize) {
+	if cap(s.decTable) < tableSize {
 		s.decTable = make([]decSymbol, tableSize)
 	}
 	s.decTable = s.decTable[:tableSize]
@@ -340,7 +340,7 @@ type decoder struct {
 func (d *decoder) init(in *bitReader, dt []decSymbol, tableLog uint8) {
 	d.dt = dt
 	d.br = in
-	d.state = uint16(in.getBits(tableLog))
+	d.state = in.getBits(tableLog)
 }
 
 // next returns the next symbol and sets the next state.
diff --git a/vendor/github.com/klauspost/compress/huff0/README.md b/vendor/github.com/klauspost/compress/huff0/README.md
index e12da4db2..8b6e5c663 100644
--- a/vendor/github.com/klauspost/compress/huff0/README.md
+++ b/vendor/github.com/klauspost/compress/huff0/README.md
@@ -14,7 +14,9 @@ but it can be used as a secondary step to compressors (like Snappy) that does no
 
 ## News
 
- * Mar 2018: First implementation released. Consider this beta software for now.
+This is used as part of the [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression and decompression package.
+
+This ensures that most functionality is well tested.
 
 # Usage
 
diff --git a/vendor/github.com/klauspost/compress/huff0/compress.go b/vendor/github.com/klauspost/compress/huff0/compress.go
index f9ed5f830..0823c928c 100644
--- a/vendor/github.com/klauspost/compress/huff0/compress.go
+++ b/vendor/github.com/klauspost/compress/huff0/compress.go
@@ -403,7 +403,7 @@ func (s *Scratch) buildCTable() error {
 	var startNode = int16(s.symbolLen)
 	nonNullRank := s.symbolLen - 1
 
-	nodeNb := int16(startNode)
+	nodeNb := startNode
 	huffNode := s.nodes[1 : huffNodesLen+1]
 
 	// This overlays the slice above, but allows "-1" index lookups.
@@ -536,7 +536,6 @@ func (s *Scratch) huffSort() {
 		}
 		nodes[pos&huffNodesMask] = nodeElt{count: c, symbol: byte(n)}
 	}
-	return
 }
 
 func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
@@ -580,7 +579,7 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
 
 		// Get pos of last (smallest) symbol per rank
 		{
-			currentNbBits := uint8(maxNbBits)
+			currentNbBits := maxNbBits
 			for pos := int(n); pos >= 0; pos-- {
 				if huffNode[pos].nbBits >= currentNbBits {
 					continue
diff --git a/vendor/github.com/klauspost/compress/snappy/runbench.cmd b/vendor/github.com/klauspost/compress/snappy/runbench.cmd
deleted file mode 100644
index d24eb4b47..000000000
--- a/vendor/github.com/klauspost/compress/snappy/runbench.cmd
+++ /dev/null
@@ -1,2 +0,0 @@
-del old.txt
-go test -bench=. >>old.txt && go test -bench=. >>old.txt && go test -bench=. >>old.txt && benchstat -delta-test=ttest old.txt new.txt
diff --git a/vendor/github.com/klauspost/compress/zstd/README.md b/vendor/github.com/klauspost/compress/zstd/README.md
index 7680bfe1d..e7d7eb0ae 100644
--- a/vendor/github.com/klauspost/compress/zstd/README.md
+++ b/vendor/github.com/klauspost/compress/zstd/README.md
@@ -152,8 +152,8 @@ This package:
 file    out     level   insize      outsize     millis  mb/s
 silesia.tar zskp    1   211947520   73101992    643     313.87
 silesia.tar zskp    2   211947520   67504318    969     208.38
-silesia.tar zskp    3   211947520   65177448    1899    106.44
-silesia.tar zskp    4   211947520   61381950    8115    24.91
+silesia.tar zskp    3   211947520   64595893    2007    100.68
+silesia.tar zskp    4   211947520   60995370    7691    26.28
 
 cgo zstd:
 silesia.tar zstd    1   211947520   73605392    543     371.56
@@ -171,8 +171,8 @@ https://files.klauspost.com/compress/gob-stream.7z
 file        out     level   insize  outsize     millis  mb/s
 gob-stream  zskp    1   1911399616  235022249   3088    590.30
 gob-stream  zskp    2   1911399616  205669791   3786    481.34
-gob-stream  zskp    3   1911399616  185792019   9324    195.48
-gob-stream  zskp    4   1911399616  171537212   32113   56.76
+gob-stream  zskp    3   1911399616  175034659   9636    189.17
+gob-stream  zskp    4   1911399616  167273881   29337   62.13
 gob-stream  zstd    1   1911399616  249810424   2637    691.26
 gob-stream  zstd    3   1911399616  208192146   3490    522.31
 gob-stream  zstd    6   1911399616  193632038   6687    272.56
@@ -187,8 +187,8 @@ http://mattmahoney.net/dc/textdata.html
 file    out level   insize      outsize     millis  mb/s
 enwik9  zskp    1   1000000000  343848582   3609    264.18
 enwik9  zskp    2   1000000000  317276632   5746    165.97
-enwik9  zskp    3   1000000000  294540704   11725   81.34
-enwik9  zskp    4   1000000000  276609671   44029   21.66
+enwik9  zskp    3   1000000000  292243069   12162   78.41
+enwik9  zskp    4   1000000000  275241169   36430   26.18
 enwik9  zstd    1   1000000000  358072021   3110    306.65
 enwik9  zstd    3   1000000000  313734672   4784    199.35
 enwik9  zstd    6   1000000000  295138875   10290   92.68
@@ -202,8 +202,8 @@ https://files.klauspost.com/compress/github-june-2days-2019.json.zst
 file                        out level   insize      outsize     millis  mb/s
 github-june-2days-2019.json zskp    1   6273951764  699045015   10620   563.40
 github-june-2days-2019.json zskp    2   6273951764  617881763   11687   511.96
-github-june-2days-2019.json zskp    3   6273951764  537511906   29252   204.54
-github-june-2days-2019.json zskp    4   6273951764  512796117   97791   61.18
+github-june-2days-2019.json zskp    3   6273951764  524340691   34043   175.75
+github-june-2days-2019.json zskp    4   6273951764  503314661   93811   63.78
 github-june-2days-2019.json zstd    1   6273951764  766284037   8450    708.00
 github-june-2days-2019.json zstd    3   6273951764  661889476   10927   547.57
 github-june-2days-2019.json zstd    6   6273951764  642756859   22996   260.18
@@ -217,8 +217,8 @@ https://files.klauspost.com/compress/rawstudio-mint14.7z
 file                    out level   insize      outsize     millis  mb/s
 rawstudio-mint14.tar    zskp    1   8558382592  3667489370  20210   403.84
 rawstudio-mint14.tar    zskp    2   8558382592  3364592300  31873   256.07
-rawstudio-mint14.tar    zskp    3   8558382592  3224594213  71751   113.75
-rawstudio-mint14.tar    zskp    4   8558382592  3027332295  486243  16.79
+rawstudio-mint14.tar    zskp    3   8558382592  3158085214  77675   105.08
+rawstudio-mint14.tar    zskp    4   8558382592  3020370044  404956  20.16
 rawstudio-mint14.tar    zstd    1   8558382592  3609250104  17136   476.27
 rawstudio-mint14.tar    zstd    3   8558382592  3341679997  29262   278.92
 rawstudio-mint14.tar    zstd    6   8558382592  3235846406  77904   104.77
@@ -232,8 +232,8 @@ https://files.klauspost.com/compress/nyc-taxi-data-10M.csv.zst
 file                    out level   insize      outsize     millis  mb/s
 nyc-taxi-data-10M.csv   zskp    1   3325605752  641339945   8925    355.35
 nyc-taxi-data-10M.csv   zskp    2   3325605752  591748091   11268   281.44
-nyc-taxi-data-10M.csv   zskp    3   3325605752  538490114   19880   159.53
-nyc-taxi-data-10M.csv   zskp    4   3325605752  495986829   89368   35.49
+nyc-taxi-data-10M.csv   zskp    3   3325605752  530289687   25239   125.66
+nyc-taxi-data-10M.csv   zskp    4   3325605752  490907191   65939   48.10
 nyc-taxi-data-10M.csv   zstd    1   3325605752  687399637   8233    385.18
 nyc-taxi-data-10M.csv   zstd    3   3325605752  598514411   10065   315.07
 nyc-taxi-data-10M.csv   zstd    6   3325605752  570522953   20038   158.27
diff --git a/vendor/github.com/klauspost/compress/zstd/blockenc.go b/vendor/github.com/klauspost/compress/zstd/blockenc.go
index c85c40255..e1be092f3 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockenc.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockenc.go
@@ -22,28 +22,44 @@ type blockEnc struct {
 	dictLitEnc *huff0.Scratch
 	wr         bitWriter
 
-	extraLits int
-	last      bool
-
+	extraLits         int
 	output            []byte
 	recentOffsets     [3]uint32
 	prevRecentOffsets [3]uint32
+
+	last   bool
+	lowMem bool
 }
 
 // init should be used once the block has been created.
 // If called more than once, the effect is the same as calling reset.
 func (b *blockEnc) init() {
-	if cap(b.literals) < maxCompressedLiteralSize {
-		b.literals = make([]byte, 0, maxCompressedLiteralSize)
-	}
-	const defSeqs = 200
-	b.literals = b.literals[:0]
-	if cap(b.sequences) < defSeqs {
-		b.sequences = make([]seq, 0, defSeqs)
-	}
-	if cap(b.output) < maxCompressedBlockSize {
-		b.output = make([]byte, 0, maxCompressedBlockSize)
+	if b.lowMem {
+		// 1K literals
+		if cap(b.literals) < 1<<10 {
+			b.literals = make([]byte, 0, 1<<10)
+		}
+		const defSeqs = 20
+		if cap(b.sequences) < defSeqs {
+			b.sequences = make([]seq, 0, defSeqs)
+		}
+		// 1K
+		if cap(b.output) < 1<<10 {
+			b.output = make([]byte, 0, 1<<10)
+		}
+	} else {
+		if cap(b.literals) < maxCompressedBlockSize {
+			b.literals = make([]byte, 0, maxCompressedBlockSize)
+		}
+		const defSeqs = 200
+		if cap(b.sequences) < defSeqs {
+			b.sequences = make([]seq, 0, defSeqs)
+		}
+		if cap(b.output) < maxCompressedBlockSize {
+			b.output = make([]byte, 0, maxCompressedBlockSize)
+		}
 	}
+
 	if b.coders.mlEnc == nil {
 		b.coders.mlEnc = &fseEncoder{}
 		b.coders.mlPrev = &fseEncoder{}
@@ -370,9 +386,9 @@ func (b *blockEnc) encodeLits(lits []byte, raw bool) error {
 		b.output = bh.appendTo(b.output)
 		b.output = append(b.output, lits[0])
 		return nil
+	case nil:
 	default:
 		return err
-	case nil:
 	}
 	// Compressed...
 	// Now, allow reuse
@@ -512,11 +528,6 @@ func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error {
 		if debug {
 			println("Adding literals RLE")
 		}
-	default:
-		if debug {
-			println("Adding literals ERROR:", err)
-		}
-		return err
 	case nil:
 		// Compressed litLen...
 		if reUsed {
@@ -547,6 +558,11 @@ func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error {
 		if debug {
 			println("Adding literals compressed")
 		}
+	default:
+		if debug {
+			println("Adding literals ERROR:", err)
+		}
+		return err
 	}
 	// Sequence compression
 
diff --git a/vendor/github.com/klauspost/compress/zstd/decodeheader.go b/vendor/github.com/klauspost/compress/zstd/decodeheader.go
index 87896c5ea..69736e8d4 100644
--- a/vendor/github.com/klauspost/compress/zstd/decodeheader.go
+++ b/vendor/github.com/klauspost/compress/zstd/decodeheader.go
@@ -93,7 +93,7 @@ func (h *Header) Decode(in []byte) error {
 	h.HasCheckSum = fhd&(1<<2) != 0
 
 	if fhd&(1<<3) != 0 {
-		return errors.New("Reserved bit set on frame header")
+		return errors.New("reserved bit set on frame header")
 	}
 
 	// Read Window_Descriptor
@@ -174,7 +174,7 @@ func (h *Header) Decode(in []byte) error {
 	if len(in) < 3 {
 		return nil
 	}
-	tmp, in := in[:3], in[3:]
+	tmp := in[:3]
 	bh := uint32(tmp[0]) | (uint32(tmp[1]) << 8) | (uint32(tmp[2]) << 16)
 	h.FirstBlock.Last = bh&1 != 0
 	blockType := blockType((bh >> 1) & 3)
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go
index cdda0de58..f593e464b 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@@ -5,7 +5,6 @@
 package zstd
 
 import (
-	"bytes"
 	"errors"
 	"io"
 	"sync"
@@ -85,6 +84,10 @@ func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) {
 	d.current.output = make(chan decodeOutput, d.o.concurrent)
 	d.current.flushed = true
 
+	if r == nil {
+		d.current.err = ErrDecoderNilInput
+	}
+
 	// Transfer option dicts.
 	d.dicts = make(map[uint32]dict, len(d.o.dicts))
 	for _, dc := range d.o.dicts {
@@ -111,7 +114,7 @@ func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) {
 // When the stream is done, io.EOF will be returned.
 func (d *Decoder) Read(p []byte) (int, error) {
 	if d.stream == nil {
-		return 0, errors.New("no input has been initialized")
+		return 0, ErrDecoderNilInput
 	}
 	var n int
 	for {
@@ -152,12 +155,20 @@ func (d *Decoder) Read(p []byte) (int, error) {
 
 // Reset will reset the decoder the supplied stream after the current has finished processing.
 // Note that this functionality cannot be used after Close has been called.
+// Reset can be called with a nil reader to release references to the previous reader.
+// After being called with a nil reader, no other operations than Reset or DecodeAll or Close
+// should be used.
 func (d *Decoder) Reset(r io.Reader) error {
 	if d.current.err == ErrDecoderClosed {
 		return d.current.err
 	}
+
+	d.drainOutput()
+
 	if r == nil {
-		return errors.New("nil Reader sent as input")
+		d.current.err = ErrDecoderNilInput
+		d.current.flushed = true
+		return nil
 	}
 
 	if d.stream == nil {
@@ -166,14 +177,13 @@ func (d *Decoder) Reset(r io.Reader) error {
 		go d.startStreamDecoder(d.stream)
 	}
 
-	d.drainOutput()
-
 	// If bytes buffer and < 1MB, do sync decoding anyway.
-	if bb, ok := r.(*bytes.Buffer); ok && bb.Len() < 1<<20 {
+	if bb, ok := r.(byter); ok && bb.Len() < 1<<20 {
+		bb2 := bb
 		if debug {
 			println("*bytes.Buffer detected, doing sync decode, len:", bb.Len())
 		}
-		b := bb.Bytes()
+		b := bb2.Bytes()
 		var dst []byte
 		if cap(d.current.b) > 0 {
 			dst = d.current.b
@@ -226,20 +236,17 @@ func (d *Decoder) drainOutput() {
 		println("current already flushed")
 		return
 	}
-	for {
-		select {
-		case v := <-d.current.output:
-			if v.d != nil {
-				if debug {
-					printf("re-adding decoder %p", v.d)
-				}
-				d.decoders <- v.d
-			}
-			if v.err == errEndOfStream {
-				println("current flushed")
-				d.current.flushed = true
-				return
+	for v := range d.current.output {
+		if v.d != nil {
+			if debug {
+				printf("re-adding decoder %p", v.d)
 			}
+			d.decoders <- v.d
+		}
+		if v.err == errEndOfStream {
+			println("current flushed")
+			d.current.flushed = true
+			return
 		}
 	}
 }
@@ -249,7 +256,7 @@ func (d *Decoder) drainOutput() {
 // Any error encountered during the write is also returned.
 func (d *Decoder) WriteTo(w io.Writer) (int64, error) {
 	if d.stream == nil {
-		return 0, errors.New("no input has been initialized")
+		return 0, ErrDecoderNilInput
 	}
 	var n int64
 	for {
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder_options.go b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
index 284d38449..c0fd058c2 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
@@ -6,7 +6,6 @@ package zstd
 
 import (
 	"errors"
-	"fmt"
 	"runtime"
 )
 
@@ -43,7 +42,7 @@ func WithDecoderLowmem(b bool) DOption {
 func WithDecoderConcurrency(n int) DOption {
 	return func(o *decoderOptions) error {
 		if n <= 0 {
-			return fmt.Errorf("Concurrency must be at least 1")
+			return errors.New("concurrency must be at least 1")
 		}
 		o.concurrent = n
 		return nil
@@ -61,7 +60,7 @@ func WithDecoderMaxMemory(n uint64) DOption {
 			return errors.New("WithDecoderMaxMemory must be at least 1")
 		}
 		if n > 1<<63 {
-			return fmt.Errorf("WithDecoderMaxmemory must be less than 1 << 63")
+			return errors.New("WithDecoderMaxmemory must be less than 1 << 63")
 		}
 		o.maxDecodedSize = n
 		return nil
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_base.go b/vendor/github.com/klauspost/compress/zstd/enc_base.go
index b1b7c6e6a..60f298648 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_base.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_base.go
@@ -7,6 +7,10 @@ import (
 	"github.com/klauspost/compress/zstd/internal/xxhash"
 )
 
+const (
+	dictShardBits = 6
+)
+
 type fastBase struct {
 	// cur is the offset at the start of hist
 	cur int32
@@ -17,6 +21,7 @@ type fastBase struct {
 	tmp         [8]byte
 	blk         *blockEnc
 	lastDictID  uint32
+	lowMem      bool
 }
 
 // CRC returns the underlying CRC writer.
@@ -57,15 +62,10 @@ func (e *fastBase) addBlock(src []byte) int32 {
 	// check if we have space already
 	if len(e.hist)+len(src) > cap(e.hist) {
 		if cap(e.hist) == 0 {
-			l := e.maxMatchOff * 2
-			// Make it at least 1MB.
-			if l < 1<<20 {
-				l = 1 << 20
-			}
-			e.hist = make([]byte, 0, l)
+			e.ensureHist(len(src))
 		} else {
-			if cap(e.hist) < int(e.maxMatchOff*2) {
-				panic("unexpected buffer size")
+			if cap(e.hist) < int(e.maxMatchOff+maxCompressedBlockSize) {
+				panic(fmt.Errorf("unexpected buffer cap %d, want at least %d with window %d", cap(e.hist), e.maxMatchOff+maxCompressedBlockSize, e.maxMatchOff))
 			}
 			// Move down
 			offset := int32(len(e.hist)) - e.maxMatchOff
@@ -79,6 +79,28 @@ func (e *fastBase) addBlock(src []byte) int32 {
 	return s
 }
 
+// ensureHist will ensure that history can keep at least this many bytes.
+func (e *fastBase) ensureHist(n int) {
+	if cap(e.hist) >= n {
+		return
+	}
+	l := e.maxMatchOff
+	if (e.lowMem && e.maxMatchOff > maxCompressedBlockSize) || e.maxMatchOff <= maxCompressedBlockSize {
+		l += maxCompressedBlockSize
+	} else {
+		l += e.maxMatchOff
+	}
+	// Make it at least 1MB.
+	if l < 1<<20 && !e.lowMem {
+		l = 1 << 20
+	}
+	// Make it at least the requested size.
+	if l < int32(n) {
+		l = int32(n)
+	}
+	e.hist = make([]byte, 0, l)
+}
+
 // useBlock will replace the block with the provided one,
 // but transfer recent offsets from the previous.
 func (e *fastBase) UseBlock(enc *blockEnc) {
@@ -117,7 +139,7 @@ func (e *fastBase) matchlen(s, t int32, src []byte) int32 {
 // Reset the encoding table.
 func (e *fastBase) resetBase(d *dict, singleBlock bool) {
 	if e.blk == nil {
-		e.blk = &blockEnc{}
+		e.blk = &blockEnc{lowMem: e.lowMem}
 		e.blk.init()
 	} else {
 		e.blk.reset(nil)
@@ -128,14 +150,15 @@ func (e *fastBase) resetBase(d *dict, singleBlock bool) {
 	} else {
 		e.crc.Reset()
 	}
-	if (!singleBlock || d.DictContentSize() > 0) && cap(e.hist) < int(e.maxMatchOff*2)+d.DictContentSize() {
-		l := e.maxMatchOff*2 + int32(d.DictContentSize())
-		// Make it at least 1MB.
-		if l < 1<<20 {
-			l = 1 << 20
+	if d != nil {
+		low := e.lowMem
+		if singleBlock {
+			e.lowMem = true
 		}
-		e.hist = make([]byte, 0, l)
+		e.ensureHist(d.DictContentSize() + maxCompressedBlockSize)
+		e.lowMem = low
 	}
+
 	// We offset current position so everything will be out of reach.
 	// If above reset line, history will be purged.
 	if e.cur < bufferReset {
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_best.go b/vendor/github.com/klauspost/compress/zstd/enc_best.go
index c4baa42c6..dc1eed5f0 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_best.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_best.go
@@ -112,7 +112,7 @@ func (e *bestFastEncoder) Encode(blk *blockEnc, src []byte) {
 	// Override src
 	src = e.hist
 	sLimit := int32(len(src)) - inputMargin
-	const kSearchStrength = 12
+	const kSearchStrength = 10
 
 	// nextEmit is where in src the next emitLiteral should start from.
 	nextEmit := s
@@ -186,9 +186,11 @@ encodeLoop:
 			best = bestOf(best, matchAt(s-offset1+1, s+1, uint32(cv>>8), 1))
 			best = bestOf(best, matchAt(s-offset2+1, s+1, uint32(cv>>8), 2))
 			best = bestOf(best, matchAt(s-offset3+1, s+1, uint32(cv>>8), 3))
-			best = bestOf(best, matchAt(s-offset1+3, s+3, uint32(cv>>24), 1))
-			best = bestOf(best, matchAt(s-offset2+3, s+3, uint32(cv>>24), 2))
-			best = bestOf(best, matchAt(s-offset3+3, s+3, uint32(cv>>24), 3))
+			if best.length > 0 {
+				best = bestOf(best, matchAt(s-offset1+3, s+3, uint32(cv>>24), 1))
+				best = bestOf(best, matchAt(s-offset2+3, s+3, uint32(cv>>24), 2))
+				best = bestOf(best, matchAt(s-offset3+3, s+3, uint32(cv>>24), 3))
+			}
 		}
 		// Load next and check...
 		e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: candidateL.offset}
@@ -218,6 +220,20 @@ encodeLoop:
 			best = bestOf(best, matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
 			best = bestOf(best, matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1))
 			best = bestOf(best, matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1))
+
+			// See if we can find a better match by checking where the current best ends.
+			// Use that offset to see if we can find a better full match.
+			if sAt := best.s + best.length; sAt < sLimit {
+				nextHashL := hash8(load6432(src, sAt), bestLongTableBits)
+				candidateEnd := e.longTable[nextHashL]
+				if pos := candidateEnd.offset - e.cur - best.length; pos >= 0 {
+					bestEnd := bestOf(best, matchAt(pos, best.s, load3232(src, best.s), -1))
+					if pos := candidateEnd.prev - e.cur - best.length; pos >= 0 {
+						bestEnd = bestOf(bestEnd, matchAt(pos, best.s, load3232(src, best.s), -1))
+					}
+					best = bestEnd
+				}
+			}
 		}
 
 		// We have a match, we can store the forward value
@@ -405,6 +421,7 @@ encodeLoop:
 // Most notable difference is that src will not be copied for history and
 // we do not need to check for max match length.
 func (e *bestFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
+	e.ensureHist(len(src))
 	e.Encode(blk, src)
 }
 
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_better.go b/vendor/github.com/klauspost/compress/zstd/enc_better.go
index 94a5343d0..604954290 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_better.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_better.go
@@ -16,29 +16,558 @@ const (
 	// This greatly depends on the type of input.
 	betterShortTableBits = 13                        // Bits used in the short match table
 	betterShortTableSize = 1 << betterShortTableBits // Size of the table
+
+	betterLongTableShardCnt  = 1 << (betterLongTableBits - dictShardBits)    // Number of shards in the table
+	betterLongTableShardSize = betterLongTableSize / betterLongTableShardCnt // Size of an individual shard
+
+	betterShortTableShardCnt  = 1 << (betterShortTableBits - dictShardBits)     // Number of shards in the table
+	betterShortTableShardSize = betterShortTableSize / betterShortTableShardCnt // Size of an individual shard
 )
 
-type prevEntry struct {
-	offset int32
-	prev   int32
+type prevEntry struct {
+	offset int32
+	prev   int32
+}
+
+// betterFastEncoder uses 2 tables, one for short matches (5 bytes) and one for long matches.
+// The long match table contains the previous entry with the same hash,
+// effectively making it a "chain" of length 2.
+// When we find a long match we choose between the two values and select the longest.
+// When we find a short match, after checking the long, we check if we can find a long at n+1
+// and that it is longer (lazy matching).
+type betterFastEncoder struct {
+	fastBase
+	table     [betterShortTableSize]tableEntry
+	longTable [betterLongTableSize]prevEntry
+}
+
+type betterFastEncoderDict struct {
+	betterFastEncoder
+	dictTable            []tableEntry
+	dictLongTable        []prevEntry
+	shortTableShardDirty [betterShortTableShardCnt]bool
+	longTableShardDirty  [betterLongTableShardCnt]bool
+	allDirty             bool
+}
+
+// Encode improves compression...
+func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) {
+	const (
+		// Input margin is the number of bytes we read (8)
+		// and the maximum we will read ahead (2)
+		inputMargin            = 8 + 2
+		minNonLiteralBlockSize = 16
+	)
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.longTable[:] {
+				e.longTable[i] = prevEntry{}
+			}
+			e.cur = e.maxMatchOff
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.longTable[:] {
+			v := e.longTable[i].offset
+			v2 := e.longTable[i].prev
+			if v < minOff {
+				v = 0
+				v2 = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+				if v2 < minOff {
+					v2 = 0
+				} else {
+					v2 = v2 - e.cur + e.maxMatchOff
+				}
+			}
+			e.longTable[i] = prevEntry{
+				offset: v,
+				prev:   v2,
+			}
+		}
+		e.cur = e.maxMatchOff
+		break
+	}
+
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+
+	// Override src
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 1.
+	const stepSize = 1
+
+	const kSearchStrength = 9
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	if debug {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		var t int32
+		// We allow the encoder to optionally turn off repeat offsets across blocks
+		canRepeat := len(blk.sequences) > 2
+		var matched int32
+
+		for {
+			if debugAsserts && canRepeat && offset1 == 0 {
+				panic("offset0 was 0")
+			}
+
+			nextHashS := hash5(cv, betterShortTableBits)
+			nextHashL := hash8(cv, betterLongTableBits)
+			candidateL := e.longTable[nextHashL]
+			candidateS := e.table[nextHashS]
+
+			const repOff = 1
+			repIndex := s - offset1 + repOff
+			off := s + e.cur
+			e.longTable[nextHashL] = prevEntry{offset: off, prev: candidateL.offset}
+			e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
+
+			if canRepeat {
+				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
+					// Consider history as well.
+					var seq seq
+					lenght := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
+
+					seq.matchLen = uint32(lenght - zstdMinMatch)
+
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+
+					// rep 0
+					seq.offset = 1
+					if debugSequences {
+						println("repeat sequence", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+
+					// Index match start+1 (long) -> s - 1
+					index0 := s + repOff
+					s += lenght + repOff
+
+					nextEmit = s
+					if s >= sLimit {
+						if debug {
+							println("repeat ended", s, lenght)
+
+						}
+						break encodeLoop
+					}
+					// Index skipped...
+					for index0 < s-1 {
+						cv0 := load6432(src, index0)
+						cv1 := cv0 >> 8
+						h0 := hash8(cv0, betterLongTableBits)
+						off := index0 + e.cur
+						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+						e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						index0 += 2
+					}
+					cv = load6432(src, s)
+					continue
+				}
+				const repOff2 = 1
+
+				// We deviate from the reference encoder and also check offset 2.
+				// Still slower and not much better, so disabled.
+				// repIndex = s - offset2 + repOff2
+				if false && repIndex >= 0 && load6432(src, repIndex) == load6432(src, s+repOff) {
+					// Consider history as well.
+					var seq seq
+					lenght := 8 + e.matchlen(s+8+repOff2, repIndex+8, src)
+
+					seq.matchLen = uint32(lenght - zstdMinMatch)
+
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff2
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+
+					// rep 2
+					seq.offset = 2
+					if debugSequences {
+						println("repeat sequence 2", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+
+					index0 := s + repOff2
+					s += lenght + repOff2
+					nextEmit = s
+					if s >= sLimit {
+						if debug {
+							println("repeat ended", s, lenght)
+
+						}
+						break encodeLoop
+					}
+
+					// Index skipped...
+					for index0 < s-1 {
+						cv0 := load6432(src, index0)
+						cv1 := cv0 >> 8
+						h0 := hash8(cv0, betterLongTableBits)
+						off := index0 + e.cur
+						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+						e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						index0 += 2
+					}
+					cv = load6432(src, s)
+					// Swap offsets
+					offset1, offset2 = offset2, offset1
+					continue
+				}
+			}
+			// Find the offsets of our two matches.
+			coffsetL := candidateL.offset - e.cur
+			coffsetLP := candidateL.prev - e.cur
+
+			// Check if we have a long match.
+			if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+				// Found a long match, at least 8 bytes.
+				matched = e.matchlen(s+8, coffsetL+8, src) + 8
+				t = coffsetL
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+
+				if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
+					// Found a long match, at least 8 bytes.
+					prevMatch := e.matchlen(s+8, coffsetLP+8, src) + 8
+					if prevMatch > matched {
+						matched = prevMatch
+						t = coffsetLP
+					}
+					if debugAsserts && s <= t {
+						panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+					}
+					if debugAsserts && s-t > e.maxMatchOff {
+						panic("s - t >e.maxMatchOff")
+					}
+					if debugMatches {
+						println("long match")
+					}
+				}
+				break
+			}
+
+			// Check if we have a long match on prev.
+			if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
+				// Found a long match, at least 8 bytes.
+				matched = e.matchlen(s+8, coffsetLP+8, src) + 8
+				t = coffsetLP
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+				break
+			}
+
+			coffsetS := candidateS.offset - e.cur
+
+			// Check if we have a short match.
+			if s-coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
+				// found a regular match
+				matched = e.matchlen(s+4, coffsetS+4, src) + 4
+
+				// See if we can find a long match at s+1
+				const checkAt = 1
+				cv := load6432(src, s+checkAt)
+				nextHashL = hash8(cv, betterLongTableBits)
+				candidateL = e.longTable[nextHashL]
+				coffsetL = candidateL.offset - e.cur
+
+				// We can store it, since we have at least a 4 byte match.
+				e.longTable[nextHashL] = prevEntry{offset: s + checkAt + e.cur, prev: candidateL.offset}
+				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+					// Found a long match, at least 8 bytes.
+					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
+					if matchedNext > matched {
+						t = coffsetL
+						s += checkAt
+						matched = matchedNext
+						if debugMatches {
+							println("long match (after short)")
+						}
+						break
+					}
+				}
+
+				// Check prev long...
+				coffsetL = candidateL.prev - e.cur
+				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+					// Found a long match, at least 8 bytes.
+					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
+					if matchedNext > matched {
+						t = coffsetL
+						s += checkAt
+						matched = matchedNext
+						if debugMatches {
+							println("prev long match (after short)")
+						}
+						break
+					}
+				}
+				t = coffsetS
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				if debugMatches {
+					println("short match")
+				}
+				break
+			}
+
+			// No match found, move forward in input.
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+
+		// Try to find a better match by searching for a long match at the end of the current best match
+		if true && s+matched < sLimit {
+			nextHashL := hash8(load6432(src, s+matched), betterLongTableBits)
+			cv := load3232(src, s)
+			candidateL := e.longTable[nextHashL]
+			coffsetL := candidateL.offset - e.cur - matched
+			if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
+				// Found a long match, at least 4 bytes.
+				matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
+				if matchedNext > matched {
+					t = coffsetL
+					matched = matchedNext
+					if debugMatches {
+						println("long match at end-of-match")
+					}
+				}
+			}
+
+			// Check prev long...
+			if true {
+				coffsetL = candidateL.prev - e.cur - matched
+				if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
+					// Found a long match, at least 4 bytes.
+					matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
+					if matchedNext > matched {
+						t = coffsetL
+						matched = matchedNext
+						if debugMatches {
+							println("prev long match at end-of-match")
+						}
+					}
+				}
+			}
+		}
+		// A match has been found. Update recent offsets.
+		offset2 = offset1
+		offset1 = s - t
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+
+		// Extend the n-byte match as long as possible.
+		l := matched
+
+		// Extend backwards
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+
+		// Index match start+1 (long) -> s - 1
+		index0 := s - l + 1
+		for index0 < s-1 {
+			cv0 := load6432(src, index0)
+			cv1 := cv0 >> 8
+			h0 := hash8(cv0, betterLongTableBits)
+			off := index0 + e.cur
+			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+			e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
+			index0 += 2
+		}
+
+		cv = load6432(src, s)
+		if !canRepeat {
+			continue
+		}
+
+		// Check offset 2
+		for {
+			o2 := s - offset2
+			if load3232(src, o2) != uint32(cv) {
+				// Do regular search
+				break
+			}
+
+			// Store this, since we have it.
+			nextHashS := hash5(cv, betterShortTableBits)
+			nextHashL := hash8(cv, betterLongTableBits)
+
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			l := 4 + e.matchlen(s+4, o2+4, src)
+
+			e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset}
+			e.table[nextHashS] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				// Finished
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+	}
+
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	if debug {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
 }
 
-// betterFastEncoder uses 2 tables, one for short matches (5 bytes) and one for long matches.
-// The long match table contains the previous entry with the same hash,
-// effectively making it a "chain" of length 2.
-// When we find a long match we choose between the two values and select the longest.
-// When we find a short match, after checking the long, we check if we can find a long at n+1
-// and that it is longer (lazy matching).
-type betterFastEncoder struct {
-	fastBase
-	table         [betterShortTableSize]tableEntry
-	longTable     [betterLongTableSize]prevEntry
-	dictTable     []tableEntry
-	dictLongTable []prevEntry
+// EncodeNoHist will encode a block with no history and no following blocks.
+// Most notable difference is that src will not be copied for history and
+// we do not need to check for max match length.
+func (e *betterFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
+	e.ensureHist(len(src))
+	e.Encode(blk, src)
 }
 
 // Encode improves compression...
-func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) {
+func (e *betterFastEncoderDict) Encode(blk *blockEnc, src []byte) {
 	const (
 		// Input margin is the number of bytes we read (8)
 		// and the maximum we will read ahead (2)
@@ -56,6 +585,7 @@ func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) {
 				e.longTable[i] = prevEntry{}
 			}
 			e.cur = e.maxMatchOff
+			e.allDirty = true
 			break
 		}
 		// Shift down everything in the table that isn't already too far away.
@@ -88,6 +618,7 @@ func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) {
 				prev:   v2,
 			}
 		}
+		e.allDirty = true
 		e.cur = e.maxMatchOff
 		break
 	}
@@ -150,7 +681,9 @@ encodeLoop:
 			repIndex := s - offset1 + repOff
 			off := s + e.cur
 			e.longTable[nextHashL] = prevEntry{offset: off, prev: candidateL.offset}
+			e.markLongShardDirty(nextHashL)
 			e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
+			e.markShortShardDirty(nextHashS)
 
 			if canRepeat {
 				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
@@ -204,7 +737,10 @@ encodeLoop:
 						h0 := hash8(cv0, betterLongTableBits)
 						off := index0 + e.cur
 						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
-						e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						e.markLongShardDirty(h0)
+						h1 := hash5(cv1, betterShortTableBits)
+						e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						e.markShortShardDirty(h1)
 						index0 += 2
 					}
 					cv = load6432(src, s)
@@ -265,7 +801,10 @@ encodeLoop:
 						h0 := hash8(cv0, betterLongTableBits)
 						off := index0 + e.cur
 						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
-						e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						e.markLongShardDirty(h0)
+						h1 := hash5(cv1, betterShortTableBits)
+						e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						e.markShortShardDirty(h1)
 						index0 += 2
 					}
 					cv = load6432(src, s)
@@ -346,6 +885,7 @@ encodeLoop:
 
 				// We can store it, since we have at least a 4 byte match.
 				e.longTable[nextHashL] = prevEntry{offset: s + checkAt + e.cur, prev: candidateL.offset}
+				e.markLongShardDirty(nextHashL)
 				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
 					// Found a long match, at least 8 bytes.
 					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
@@ -398,9 +938,41 @@ encodeLoop:
 			}
 			cv = load6432(src, s)
 		}
+		// Try to find a better match by searching for a long match at the end of the current best match
+		if s+matched < sLimit {
+			nextHashL := hash8(load6432(src, s+matched), betterLongTableBits)
+			cv := load3232(src, s)
+			candidateL := e.longTable[nextHashL]
+			coffsetL := candidateL.offset - e.cur - matched
+			if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
+				// Found a long match, at least 4 bytes.
+				matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
+				if matchedNext > matched {
+					t = coffsetL
+					matched = matchedNext
+					if debugMatches {
+						println("long match at end-of-match")
+					}
+				}
+			}
 
-		// A 4-byte match has been found. Update recent offsets.
-		// We'll later see if more than 4 bytes.
+			// Check prev long...
+			if true {
+				coffsetL = candidateL.prev - e.cur - matched
+				if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
+					// Found a long match, at least 4 bytes.
+					matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
+					if matchedNext > matched {
+						t = coffsetL
+						matched = matchedNext
+						if debugMatches {
+							println("prev long match at end-of-match")
+						}
+					}
+				}
+			}
+		}
+		// A match has been found. Update recent offsets.
 		offset2 = offset1
 		offset1 = s - t
 
@@ -452,7 +1024,10 @@ encodeLoop:
 			h0 := hash8(cv0, betterLongTableBits)
 			off := index0 + e.cur
 			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
-			e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
+			e.markLongShardDirty(h0)
+			h1 := hash5(cv1, betterShortTableBits)
+			e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
+			e.markShortShardDirty(h1)
 			index0 += 2
 		}
 
@@ -478,7 +1053,9 @@ encodeLoop:
 			l := 4 + e.matchlen(s+4, o2+4, src)
 
 			e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset}
+			e.markLongShardDirty(nextHashL)
 			e.table[nextHashS] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.markShortShardDirty(nextHashS)
 			seq.matchLen = uint32(l) - zstdMinMatch
 			seq.litLen = 0
 
@@ -512,15 +1089,16 @@ encodeLoop:
 	}
 }
 
-// EncodeNoHist will encode a block with no history and no following blocks.
-// Most notable difference is that src will not be copied for history and
-// we do not need to check for max match length.
-func (e *betterFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
-	e.Encode(blk, src)
+// ResetDict will reset and set a dictionary if not nil
+func (e *betterFastEncoder) Reset(d *dict, singleBlock bool) {
+	e.resetBase(d, singleBlock)
+	if d != nil {
+		panic("betterFastEncoder: Reset with dict")
+	}
 }
 
 // ResetDict will reset and set a dictionary if not nil
-func (e *betterFastEncoder) Reset(d *dict, singleBlock bool) {
+func (e *betterFastEncoderDict) Reset(d *dict, singleBlock bool) {
 	e.resetBase(d, singleBlock)
 	if d == nil {
 		return
@@ -557,6 +1135,7 @@ func (e *betterFastEncoder) Reset(d *dict, singleBlock bool) {
 			}
 		}
 		e.lastDictID = d.id
+		e.allDirty = true
 	}
 
 	// Init or copy dict table
@@ -585,11 +1164,72 @@ func (e *betterFastEncoder) Reset(d *dict, singleBlock bool) {
 			}
 		}
 		e.lastDictID = d.id
+		e.allDirty = true
 	}
+
 	// Reset table to initial state
-	copy(e.longTable[:], e.dictLongTable)
+	{
+		dirtyShardCnt := 0
+		if !e.allDirty {
+			for i := range e.shortTableShardDirty {
+				if e.shortTableShardDirty[i] {
+					dirtyShardCnt++
+				}
+			}
+		}
+		const shardCnt = betterShortTableShardCnt
+		const shardSize = betterShortTableShardSize
+		if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
+			copy(e.table[:], e.dictTable)
+			for i := range e.shortTableShardDirty {
+				e.shortTableShardDirty[i] = false
+			}
+		} else {
+			for i := range e.shortTableShardDirty {
+				if !e.shortTableShardDirty[i] {
+					continue
+				}
+
+				copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
+				e.shortTableShardDirty[i] = false
+			}
+		}
+	}
+	{
+		dirtyShardCnt := 0
+		if !e.allDirty {
+			for i := range e.shortTableShardDirty {
+				if e.shortTableShardDirty[i] {
+					dirtyShardCnt++
+				}
+			}
+		}
+		const shardCnt = betterLongTableShardCnt
+		const shardSize = betterLongTableShardSize
+		if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
+			copy(e.longTable[:], e.dictLongTable)
+			for i := range e.longTableShardDirty {
+				e.longTableShardDirty[i] = false
+			}
+		} else {
+			for i := range e.longTableShardDirty {
+				if !e.longTableShardDirty[i] {
+					continue
+				}
 
+				copy(e.longTable[i*shardSize:(i+1)*shardSize], e.dictLongTable[i*shardSize:(i+1)*shardSize])
+				e.longTableShardDirty[i] = false
+			}
+		}
+	}
 	e.cur = e.maxMatchOff
-	// Reset table to initial state
-	copy(e.table[:], e.dictTable)
+	e.allDirty = false
+}
+
+func (e *betterFastEncoderDict) markLongShardDirty(entryNum uint32) {
+	e.longTableShardDirty[entryNum/betterLongTableShardSize] = true
+}
+
+func (e *betterFastEncoderDict) markShortShardDirty(entryNum uint32) {
+	e.shortTableShardDirty[entryNum/betterShortTableShardSize] = true
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
index 19eebf66e..8629d43d8 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
@@ -11,6 +11,9 @@ const (
 	dFastLongTableSize = 1 << dFastLongTableBits // Size of the table
 	dFastLongTableMask = dFastLongTableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
 
+	dLongTableShardCnt  = 1 << (dFastLongTableBits - dictShardBits) // Number of shards in the table
+	dLongTableShardSize = dFastLongTableSize / tableShardCnt        // Size of an individual shard
+
 	dFastShortTableBits = tableBits                // Bits used in the short match table
 	dFastShortTableSize = 1 << dFastShortTableBits // Size of the table
 	dFastShortTableMask = dFastShortTableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
@@ -18,8 +21,14 @@ const (
 
 type doubleFastEncoder struct {
 	fastEncoder
-	longTable     [dFastLongTableSize]tableEntry
-	dictLongTable []tableEntry
+	longTable [dFastLongTableSize]tableEntry
+}
+
+type doubleFastEncoderDict struct {
+	fastEncoderDict
+	longTable           [dFastLongTableSize]tableEntry
+	dictLongTable       []tableEntry
+	longTableShardDirty [dLongTableShardCnt]bool
 }
 
 // Encode mimmics functionality in zstd_dfast.c
@@ -678,9 +687,379 @@ encodeLoop:
 	}
 }
 
+// Encode will encode the content, with a dictionary if initialized for it.
+func (e *doubleFastEncoderDict) Encode(blk *blockEnc, src []byte) {
+	const (
+		// Input margin is the number of bytes we read (8)
+		// and the maximum we will read ahead (2)
+		inputMargin            = 8 + 2
+		minNonLiteralBlockSize = 16
+	)
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.longTable[:] {
+				e.longTable[i] = tableEntry{}
+			}
+			e.markAllShardsDirty()
+			e.cur = e.maxMatchOff
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.longTable[:] {
+			v := e.longTable[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.longTable[i].offset = v
+		}
+		e.markAllShardsDirty()
+		e.cur = e.maxMatchOff
+		break
+	}
+
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+
+	// Override src
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 1.
+	const stepSize = 1
+
+	const kSearchStrength = 8
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	if debug {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		var t int32
+		// We allow the encoder to optionally turn off repeat offsets across blocks
+		canRepeat := len(blk.sequences) > 2
+
+		for {
+			if debugAsserts && canRepeat && offset1 == 0 {
+				panic("offset0 was 0")
+			}
+
+			nextHashS := hash5(cv, dFastShortTableBits)
+			nextHashL := hash8(cv, dFastLongTableBits)
+			candidateL := e.longTable[nextHashL]
+			candidateS := e.table[nextHashS]
+
+			const repOff = 1
+			repIndex := s - offset1 + repOff
+			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.longTable[nextHashL] = entry
+			e.markLongShardDirty(nextHashL)
+			e.table[nextHashS] = entry
+			e.markShardDirty(nextHashS)
+
+			if canRepeat {
+				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
+					// Consider history as well.
+					var seq seq
+					lenght := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
+
+					seq.matchLen = uint32(lenght - zstdMinMatch)
+
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+
+					// rep 0
+					seq.offset = 1
+					if debugSequences {
+						println("repeat sequence", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+					s += lenght + repOff
+					nextEmit = s
+					if s >= sLimit {
+						if debug {
+							println("repeat ended", s, lenght)
+
+						}
+						break encodeLoop
+					}
+					cv = load6432(src, s)
+					continue
+				}
+			}
+			// Find the offsets of our two matches.
+			coffsetL := s - (candidateL.offset - e.cur)
+			coffsetS := s - (candidateS.offset - e.cur)
+
+			// Check if we have a long match.
+			if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
+				// Found a long match, likely at least 8 bytes.
+				// Reference encoder checks all 8 bytes, we only check 4,
+				// but the likelihood of both the first 4 bytes and the hash matching should be enough.
+				t = candidateL.offset - e.cur
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+				break
+			}
+
+			// Check if we have a short match.
+			if coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
+				// found a regular match
+				// See if we can find a long match at s+1
+				const checkAt = 1
+				cv := load6432(src, s+checkAt)
+				nextHashL = hash8(cv, dFastLongTableBits)
+				candidateL = e.longTable[nextHashL]
+				coffsetL = s - (candidateL.offset - e.cur) + checkAt
+
+				// We can store it, since we have at least a 4 byte match.
+				e.longTable[nextHashL] = tableEntry{offset: s + checkAt + e.cur, val: uint32(cv)}
+				e.markLongShardDirty(nextHashL)
+				if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
+					// Found a long match, likely at least 8 bytes.
+					// Reference encoder checks all 8 bytes, we only check 4,
+					// but the likelihood of both the first 4 bytes and the hash matching should be enough.
+					t = candidateL.offset - e.cur
+					s += checkAt
+					if debugMatches {
+						println("long match (after short)")
+					}
+					break
+				}
+
+				t = candidateS.offset - e.cur
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				if debugMatches {
+					println("short match")
+				}
+				break
+			}
+
+			// No match found, move forward in input.
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+
+		// A 4-byte match has been found. Update recent offsets.
+		// We'll later see if more than 4 bytes.
+		offset2 = offset1
+		offset1 = s - t
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+
+		// Extend the 4-byte match as long as possible.
+		l := e.matchlen(s+4, t+4, src) + 4
+
+		// Extend backwards
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+
+		// Index match start+1 (long) and start+2 (short)
+		index0 := s - l + 1
+		// Index match end-2 (long) and end-1 (short)
+		index1 := s - 2
+
+		cv0 := load6432(src, index0)
+		cv1 := load6432(src, index1)
+		te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
+		te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
+		longHash1 := hash8(cv0, dFastLongTableBits)
+		longHash2 := hash8(cv0, dFastLongTableBits)
+		e.longTable[longHash1] = te0
+		e.longTable[longHash2] = te1
+		e.markLongShardDirty(longHash1)
+		e.markLongShardDirty(longHash2)
+		cv0 >>= 8
+		cv1 >>= 8
+		te0.offset++
+		te1.offset++
+		te0.val = uint32(cv0)
+		te1.val = uint32(cv1)
+		hashVal1 := hash5(cv0, dFastShortTableBits)
+		hashVal2 := hash5(cv1, dFastShortTableBits)
+		e.table[hashVal1] = te0
+		e.markShardDirty(hashVal1)
+		e.table[hashVal2] = te1
+		e.markShardDirty(hashVal2)
+
+		cv = load6432(src, s)
+
+		if !canRepeat {
+			continue
+		}
+
+		// Check offset 2
+		for {
+			o2 := s - offset2
+			if load3232(src, o2) != uint32(cv) {
+				// Do regular search
+				break
+			}
+
+			// Store this, since we have it.
+			nextHashS := hash5(cv, dFastShortTableBits)
+			nextHashL := hash8(cv, dFastLongTableBits)
+
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			l := 4 + e.matchlen(s+4, o2+4, src)
+
+			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.longTable[nextHashL] = entry
+			e.markLongShardDirty(nextHashL)
+			e.table[nextHashS] = entry
+			e.markShardDirty(nextHashS)
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				// Finished
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+	}
+
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	if debug {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+	// If we encoded more than 64K mark all dirty.
+	if len(src) > 64<<10 {
+		e.markAllShardsDirty()
+	}
+}
+
 // ResetDict will reset and set a dictionary if not nil
 func (e *doubleFastEncoder) Reset(d *dict, singleBlock bool) {
 	e.fastEncoder.Reset(d, singleBlock)
+	if d != nil {
+		panic("doubleFastEncoder: Reset with dict not supported")
+	}
+}
+
+// ResetDict will reset and set a dictionary if not nil
+func (e *doubleFastEncoderDict) Reset(d *dict, singleBlock bool) {
+	allDirty := e.allDirty
+	e.fastEncoderDict.Reset(d, singleBlock)
 	if d == nil {
 		return
 	}
@@ -706,8 +1085,37 @@ func (e *doubleFastEncoder) Reset(d *dict, singleBlock bool) {
 			}
 		}
 		e.lastDictID = d.id
+		e.allDirty = true
 	}
 	// Reset table to initial state
 	e.cur = e.maxMatchOff
-	copy(e.longTable[:], e.dictLongTable)
+
+	dirtyShardCnt := 0
+	if !allDirty {
+		for i := range e.longTableShardDirty {
+			if e.longTableShardDirty[i] {
+				dirtyShardCnt++
+			}
+		}
+	}
+
+	if allDirty || dirtyShardCnt > dLongTableShardCnt/2 {
+		copy(e.longTable[:], e.dictLongTable)
+		for i := range e.longTableShardDirty {
+			e.longTableShardDirty[i] = false
+		}
+		return
+	}
+	for i := range e.longTableShardDirty {
+		if !e.longTableShardDirty[i] {
+			continue
+		}
+
+		copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize])
+		e.longTableShardDirty[i] = false
+	}
+}
+
+func (e *doubleFastEncoderDict) markLongShardDirty(entryNum uint32) {
+	e.longTableShardDirty[entryNum/dLongTableShardSize] = true
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_fast.go b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
index 0b301df43..ba4a17e10 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_fast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
@@ -11,9 +11,11 @@ import (
 )
 
 const (
-	tableBits      = 15             // Bits used in the table
-	tableSize      = 1 << tableBits // Size of the table
-	tableMask      = tableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
+	tableBits      = 15                               // Bits used in the table
+	tableSize      = 1 << tableBits                   // Size of the table
+	tableShardCnt  = 1 << (tableBits - dictShardBits) // Number of shards in the table
+	tableShardSize = tableSize / tableShardCnt        // Size of an individual shard
+	tableMask      = tableSize - 1                    // Mask for table indices. Redundant, but can eliminate bounds checks.
 	maxMatchLength = 131074
 )
 
@@ -24,8 +26,14 @@ type tableEntry struct {
 
 type fastEncoder struct {
 	fastBase
-	table     [tableSize]tableEntry
-	dictTable []tableEntry
+	table [tableSize]tableEntry
+}
+
+type fastEncoderDict struct {
+	fastEncoder
+	dictTable       []tableEntry
+	tableShardDirty [tableShardCnt]bool
+	allDirty        bool
 }
 
 // Encode mimmics functionality in zstd_fast.c
@@ -78,7 +86,7 @@ func (e *fastEncoder) Encode(blk *blockEnc, src []byte) {
 	// TEMPLATE
 	const hashLog = tableBits
 	// seems global, but would be nice to tweak.
-	const kSearchStrength = 8
+	const kSearchStrength = 7
 
 	// nextEmit is where in src the next emitLiteral should start from.
 	nextEmit := s
@@ -617,8 +625,322 @@ encodeLoop:
 	}
 }
 
+// Encode will encode the content, with a dictionary if initialized for it.
+func (e *fastEncoderDict) Encode(blk *blockEnc, src []byte) {
+	const (
+		inputMargin            = 8
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+	if e.allDirty || len(src) > 32<<10 {
+		e.fastEncoder.Encode(blk, src)
+		e.allDirty = true
+		return
+	}
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			e.cur = e.maxMatchOff
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.table[i].offset = v
+		}
+		e.cur = e.maxMatchOff
+		break
+	}
+
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+
+	// Override src
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 2.
+	const stepSize = 2
+
+	// TEMPLATE
+	const hashLog = tableBits
+	// seems global, but would be nice to tweak.
+	const kSearchStrength = 7
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	if debug {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		// t will contain the match offset when we find one.
+		// When existing the search loop, we have already checked 4 bytes.
+		var t int32
+
+		// We will not use repeat offsets across blocks.
+		// By not using them for the first 3 matches
+		canRepeat := len(blk.sequences) > 2
+
+		for {
+			if debugAsserts && canRepeat && offset1 == 0 {
+				panic("offset0 was 0")
+			}
+
+			nextHash := hash6(cv, hashLog)
+			nextHash2 := hash6(cv>>8, hashLog)
+			candidate := e.table[nextHash]
+			candidate2 := e.table[nextHash2]
+			repIndex := s - offset1 + 2
+
+			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.markShardDirty(nextHash)
+			e.table[nextHash2] = tableEntry{offset: s + e.cur + 1, val: uint32(cv >> 8)}
+			e.markShardDirty(nextHash2)
+
+			if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) {
+				// Consider history as well.
+				var seq seq
+				var length int32
+				// length = 4 + e.matchlen(s+6, repIndex+4, src)
+				{
+					a := src[s+6:]
+					b := src[repIndex+4:]
+					endI := len(a) & (math.MaxInt32 - 7)
+					length = int32(endI) + 4
+					for i := 0; i < endI; i += 8 {
+						if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+							length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+							break
+						}
+					}
+				}
+
+				seq.matchLen = uint32(length - zstdMinMatch)
+
+				// We might be able to match backwards.
+				// Extend as long as we can.
+				start := s + 2
+				// We end the search early, so we don't risk 0 literals
+				// and have to do special offset treatment.
+				startLimit := nextEmit + 1
+
+				sMin := s - e.maxMatchOff
+				if sMin < 0 {
+					sMin = 0
+				}
+				for repIndex > sMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch {
+					repIndex--
+					start--
+					seq.matchLen++
+				}
+				addLiterals(&seq, start)
+
+				// rep 0
+				seq.offset = 1
+				if debugSequences {
+					println("repeat sequence", seq, "next s:", s)
+				}
+				blk.sequences = append(blk.sequences, seq)
+				s += length + 2
+				nextEmit = s
+				if s >= sLimit {
+					if debug {
+						println("repeat ended", s, length)
+
+					}
+					break encodeLoop
+				}
+				cv = load6432(src, s)
+				continue
+			}
+			coffset0 := s - (candidate.offset - e.cur)
+			coffset1 := s - (candidate2.offset - e.cur) + 1
+			if coffset0 < e.maxMatchOff && uint32(cv) == candidate.val {
+				// found a regular match
+				t = candidate.offset - e.cur
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				break
+			}
+
+			if coffset1 < e.maxMatchOff && uint32(cv>>8) == candidate2.val {
+				// found a regular match
+				t = candidate2.offset - e.cur
+				s++
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				break
+			}
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+		// A 4-byte match has been found. We'll later see if more than 4 bytes.
+		offset2 = offset1
+		offset1 = s - t
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+
+		// Extend the 4-byte match as long as possible.
+		//l := e.matchlen(s+4, t+4, src) + 4
+		var l int32
+		{
+			a := src[s+4:]
+			b := src[t+4:]
+			endI := len(a) & (math.MaxInt32 - 7)
+			l = int32(endI) + 4
+			for i := 0; i < endI; i += 8 {
+				if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+					l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+					break
+				}
+			}
+		}
+
+		// Extend backwards
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence.
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		// Don't use repeat offsets
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+		cv = load6432(src, s)
+
+		// Check offset 2
+		if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) {
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			//l := 4 + e.matchlen(s+4, o2+4, src)
+			var l int32
+			{
+				a := src[s+4:]
+				b := src[o2+4:]
+				endI := len(a) & (math.MaxInt32 - 7)
+				l = int32(endI) + 4
+				for i := 0; i < endI; i += 8 {
+					if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+						l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+						break
+					}
+				}
+			}
+
+			// Store this, since we have it.
+			nextHash := hash6(cv, hashLog)
+			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.markShardDirty(nextHash)
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				break encodeLoop
+			}
+			// Prepare next loop.
+			cv = load6432(src, s)
+		}
+	}
+
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	if debug {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+}
+
 // ResetDict will reset and set a dictionary if not nil
 func (e *fastEncoder) Reset(d *dict, singleBlock bool) {
+	e.resetBase(d, singleBlock)
+	if d != nil {
+		panic("fastEncoder: Reset with dict")
+	}
+}
+
+// ResetDict will reset and set a dictionary if not nil
+func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) {
 	e.resetBase(d, singleBlock)
 	if d == nil {
 		return
@@ -653,9 +975,44 @@ func (e *fastEncoder) Reset(d *dict, singleBlock bool) {
 			}
 		}
 		e.lastDictID = d.id
+		e.allDirty = true
 	}
 
 	e.cur = e.maxMatchOff
-	// Reset table to initial state
-	copy(e.table[:], e.dictTable)
+	dirtyShardCnt := 0
+	if !e.allDirty {
+		for i := range e.tableShardDirty {
+			if e.tableShardDirty[i] {
+				dirtyShardCnt++
+			}
+		}
+	}
+
+	const shardCnt = tableShardCnt
+	const shardSize = tableShardSize
+	if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
+		copy(e.table[:], e.dictTable)
+		for i := range e.tableShardDirty {
+			e.tableShardDirty[i] = false
+		}
+		e.allDirty = false
+		return
+	}
+	for i := range e.tableShardDirty {
+		if !e.tableShardDirty[i] {
+			continue
+		}
+
+		copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
+		e.tableShardDirty[i] = false
+	}
+	e.allDirty = false
+}
+
+func (e *fastEncoderDict) markAllShardsDirty() {
+	e.allDirty = true
+}
+
+func (e *fastEncoderDict) markShardDirty(entryNum uint32) {
+	e.tableShardDirty[entryNum/tableShardSize] = true
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go
index f5759211d..4871dd03a 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder.go
@@ -106,7 +106,7 @@ func (e *Encoder) Reset(w io.Writer) {
 		s.encoder = e.o.encoder()
 	}
 	if s.writing == nil {
-		s.writing = &blockEnc{}
+		s.writing = &blockEnc{lowMem: e.o.lowMem}
 		s.writing.init()
 	}
 	s.writing.initNewEncode()
@@ -176,6 +176,12 @@ func (e *Encoder) nextBlock(final bool) error {
 	}
 	if !s.headerWritten {
 		// If we have a single block encode, do a sync compression.
+		if final && len(s.filling) == 0 && !e.o.fullZero {
+			s.headerWritten = true
+			s.fullFrameWritten = true
+			s.eofWritten = true
+			return nil
+		}
 		if final && len(s.filling) > 0 {
 			s.current = e.EncodeAll(s.filling, s.current[:0])
 			var n2 int
@@ -334,13 +340,13 @@ func (e *Encoder) ReadFrom(r io.Reader) (n int64, err error) {
 				println("ReadFrom: got EOF final block:", len(e.state.filling))
 			}
 			return n, nil
+		case nil:
 		default:
 			if debug {
 				println("ReadFrom: got error:", err)
 			}
 			e.state.err = err
 			return n, err
-		case nil:
 		}
 		if len(src) > 0 {
 			if debug {
@@ -471,7 +477,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 	}
 
 	// If less than 1MB, allocate a buffer up front.
-	if len(dst) == 0 && cap(dst) == 0 && len(src) < 1<<20 {
+	if len(dst) == 0 && cap(dst) == 0 && len(src) < 1<<20 && !e.o.lowMem {
 		dst = make([]byte, 0, len(src))
 	}
 	dst, err := fh.appendTo(dst)
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder_options.go b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
index a7312f42a..16d4ab63c 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
@@ -24,12 +24,12 @@ type encoderOptions struct {
 	allLitEntropy   bool
 	customWindow    bool
 	customALEntropy bool
+	lowMem          bool
 	dict            *dict
 }
 
 func (o *encoderOptions) setDefault() {
 	*o = encoderOptions{
-		// use less ram: true for now, but may change.
 		concurrent:    runtime.GOMAXPROCS(0),
 		crc:           true,
 		single:        nil,
@@ -37,20 +37,31 @@ func (o *encoderOptions) setDefault() {
 		windowSize:    8 << 20,
 		level:         SpeedDefault,
 		allLitEntropy: true,
+		lowMem:        false,
 	}
 }
 
 // encoder returns an encoder with the selected options.
 func (o encoderOptions) encoder() encoder {
 	switch o.level {
+	case SpeedFastest:
+		if o.dict != nil {
+			return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
+		}
+		return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
+
 	case SpeedDefault:
-		return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}}}
+		if o.dict != nil {
+			return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}}
+		}
+		return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
 	case SpeedBetterCompression:
-		return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}}
+		if o.dict != nil {
+			return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
+		}
+		return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
 	case SpeedBestCompression:
-		return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}}
-	case SpeedFastest:
-		return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}}
+		return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
 	}
 	panic("unknown compression level")
 }
@@ -62,7 +73,7 @@ func WithEncoderCRC(b bool) EOption {
 }
 
 // WithEncoderConcurrency will set the concurrency,
-// meaning the maximum number of decoders to run concurrently.
+// meaning the maximum number of encoders to run concurrently.
 // The value supplied must be at least 1.
 // By default this will be set to GOMAXPROCS.
 func WithEncoderConcurrency(n int) EOption {
@@ -276,6 +287,17 @@ func WithSingleSegment(b bool) EOption {
 	}
 }
 
+// WithLowerEncoderMem will trade in some memory cases trade less memory usage for
+// slower encoding speed.
+// This will not change the window size which is the primary function for reducing
+// memory usage. See WithWindowSize.
+func WithLowerEncoderMem(b bool) EOption {
+	return func(o *encoderOptions) error {
+		o.lowMem = b
+		return nil
+	}
+}
+
 // WithEncoderDict allows to register a dictionary that will be used for the encode.
 // The encoder *may* choose to use no dictionary instead for certain payloads.
 func WithEncoderDict(dict []byte) EOption {
diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go
index fc4a566d3..693c5f05d 100644
--- a/vendor/github.com/klauspost/compress/zstd/framedec.go
+++ b/vendor/github.com/klauspost/compress/zstd/framedec.go
@@ -121,7 +121,7 @@ func (d *frameDec) reset(br byteBuffer) error {
 	d.SingleSegment = fhd&(1<<5) != 0
 
 	if fhd&(1<<3) != 0 {
-		return errors.New("Reserved bit set on frame header")
+		return errors.New("reserved bit set on frame header")
 	}
 
 	// Read Window_Descriptor
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
index aa9eba88b..c74681b99 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
@@ -97,7 +97,7 @@ func (s *fseEncoder) prepare() (*fseEncoder, error) {
 func (s *fseEncoder) allocCtable() {
 	tableSize := 1 << s.actualTableLog
 	// get tableSymbol that is big enough.
-	if cap(s.ct.tableSymbol) < int(tableSize) {
+	if cap(s.ct.tableSymbol) < tableSize {
 		s.ct.tableSymbol = make([]byte, tableSize)
 	}
 	s.ct.tableSymbol = s.ct.tableSymbol[:tableSize]
@@ -202,13 +202,13 @@ func (s *fseEncoder) buildCTable() error {
 			case 0:
 			case -1, 1:
 				symbolTT[i].deltaNbBits = tl
-				symbolTT[i].deltaFindState = int16(total - 1)
+				symbolTT[i].deltaFindState = total - 1
 				total++
 			default:
 				maxBitsOut := uint32(tableLog) - highBit(uint32(v-1))
 				minStatePlus := uint32(v) << maxBitsOut
 				symbolTT[i].deltaNbBits = (maxBitsOut << 16) - minStatePlus
-				symbolTT[i].deltaFindState = int16(total - v)
+				symbolTT[i].deltaFindState = total - v
 				total += v
 			}
 		}
@@ -353,8 +353,8 @@ func (s *fseEncoder) normalizeCount2(length int) error {
 		distributed  uint32
 		total        = uint32(length)
 		tableLog     = s.actualTableLog
-		lowThreshold = uint32(total >> tableLog)
-		lowOne       = uint32((total * 3) >> (tableLog + 1))
+		lowThreshold = total >> tableLog
+		lowOne       = (total * 3) >> (tableLog + 1)
 	)
 	for i, cnt := range s.count[:s.symbolLen] {
 		if cnt == 0 {
@@ -379,7 +379,7 @@ func (s *fseEncoder) normalizeCount2(length int) error {
 
 	if (total / toDistribute) > lowOne {
 		// risk of rounding to zero
-		lowOne = uint32((total * 3) / (toDistribute * 2))
+		lowOne = (total * 3) / (toDistribute * 2)
 		for i, cnt := range s.count[:s.symbolLen] {
 			if (s.norm[i] == notYetAssigned) && (cnt <= lowOne) {
 				s.norm[i] = 1
@@ -708,7 +708,6 @@ func (c *cState) init(bw *bitWriter, ct *cTable, first symbolTransform) {
 	im := int32((nbBitsOut << 16) - first.deltaNbBits)
 	lu := (im >> nbBitsOut) + int32(first.deltaFindState)
 	c.state = c.stateTable[lu]
-	return
 }
 
 // encode the output symbol provided and write it to the bitstream.
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_predefined.go b/vendor/github.com/klauspost/compress/zstd/fse_predefined.go
index 6c17dc17f..474cb77d2 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_predefined.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_predefined.go
@@ -59,7 +59,7 @@ func fillBase(dst []baseOffset, base uint32, bits ...uint8) {
 	}
 	for i, bit := range bits {
 		if base > math.MaxInt32 {
-			panic(fmt.Sprintf("invalid decoding table, base overflows int32"))
+			panic("invalid decoding table, base overflows int32")
 		}
 
 		dst[i] = baseOffset{
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go
index b5c8ef133..1dd39e63b 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go
@@ -181,11 +181,18 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 			return fmt.Errorf("output (%d) bigger than max block size", size)
 		}
 		if size > cap(s.out) {
-			// Not enough size, will be extremely rarely triggered,
+			// Not enough size, which can happen under high volume block streaming conditions
 			// but could be if destination slice is too small for sync operations.
-			// We add maxBlockSize to the capacity.
-			s.out = append(s.out, make([]byte, maxBlockSize)...)
-			s.out = s.out[:len(s.out)-maxBlockSize]
+			// over-allocating here can create a large amount of GC pressure so we try to keep
+			// it as contained as possible
+			used := len(s.out) - startSize
+			addBytes := 256 + ll + ml + used>>2
+			// Clamp to max block size.
+			if used+addBytes > maxBlockSize {
+				addBytes = maxBlockSize - used
+			}
+			s.out = append(s.out, make([]byte, addBytes)...)
+			s.out = s.out[:len(s.out)-addBytes]
 		}
 		if ml > maxMatchLen {
 			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
diff --git a/vendor/github.com/klauspost/compress/zstd/seqenc.go b/vendor/github.com/klauspost/compress/zstd/seqenc.go
index 36bcc3cc0..8014174a7 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqenc.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqenc.go
@@ -35,7 +35,6 @@ func (s *seqCoders) setPrev(ll, ml, of *fseEncoder) {
 		// Ensure we cannot reuse by accident
 		prevEnc := *prev
 		prevEnc.symbolLen = 0
-		return
 	}
 	compareSwap(ll, &s.llEnc, &s.llPrev)
 	compareSwap(ml, &s.mlEnc, &s.mlPrev)
diff --git a/vendor/github.com/klauspost/compress/zstd/snappy.go b/vendor/github.com/klauspost/compress/zstd/snappy.go
index 841fd95ac..9d9d1d567 100644
--- a/vendor/github.com/klauspost/compress/zstd/snappy.go
+++ b/vendor/github.com/klauspost/compress/zstd/snappy.go
@@ -10,8 +10,8 @@ import (
 	"hash/crc32"
 	"io"
 
+	"github.com/golang/snappy"
 	"github.com/klauspost/compress/huff0"
-	"github.com/klauspost/compress/snappy"
 )
 
 const (
@@ -185,7 +185,6 @@ func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) {
 				r.block.reset(nil)
 				r.block.literals, err = snappy.Decode(r.block.literals[:n], r.buf[snappyChecksumSize:chunkLen])
 				if err != nil {
-					println("snappy.Decode:", err)
 					return written, err
 				}
 				err = r.block.encodeLits(r.block.literals, false)
@@ -417,7 +416,7 @@ var crcTable = crc32.MakeTable(crc32.Castagnoli)
 // https://github.com/google/snappy/blob/master/framing_format.txt
 func snappyCRC(b []byte) uint32 {
 	c := crc32.Update(0, crcTable, b)
-	return uint32(c>>15|c<<17) + 0xa282ead8
+	return c>>15 | c<<17 + 0xa282ead8
 }
 
 // snappyDecodedLen returns the length of the decoded block and the number of bytes
diff --git a/vendor/github.com/klauspost/compress/zstd/zip.go b/vendor/github.com/klauspost/compress/zstd/zip.go
new file mode 100644
index 000000000..e35a0a2f8
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/zip.go
@@ -0,0 +1,120 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+
+package zstd
+
+import (
+	"errors"
+	"io"
+	"sync"
+)
+
+// ZipMethodWinZip is the method for Zstandard compressed data inside Zip files for WinZip.
+// See https://www.winzip.com/win/en/comp_info.html
+const ZipMethodWinZip = 93
+
+// ZipMethodPKWare is the method number used by PKWARE to indicate Zstandard compression.
+// See https://pkware.cachefly.net/webdocs/APPNOTE/APPNOTE-6.3.7.TXT
+const ZipMethodPKWare = 20
+
+var zipReaderPool sync.Pool
+
+// newZipReader cannot be used since we would leak goroutines...
+func newZipReader(r io.Reader) io.ReadCloser {
+	dec, ok := zipReaderPool.Get().(*Decoder)
+	if ok {
+		dec.Reset(r)
+	} else {
+		d, err := NewReader(r, WithDecoderConcurrency(1), WithDecoderLowmem(true))
+		if err != nil {
+			panic(err)
+		}
+		dec = d
+	}
+	return &pooledZipReader{dec: dec}
+}
+
+type pooledZipReader struct {
+	mu  sync.Mutex // guards Close and Read
+	dec *Decoder
+}
+
+func (r *pooledZipReader) Read(p []byte) (n int, err error) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	if r.dec == nil {
+		return 0, errors.New("Read after Close")
+	}
+	dec, err := r.dec.Read(p)
+
+	return dec, err
+}
+
+func (r *pooledZipReader) Close() error {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	var err error
+	if r.dec != nil {
+		err = r.dec.Reset(nil)
+		zipReaderPool.Put(r.dec)
+		r.dec = nil
+	}
+	return err
+}
+
+type pooledZipWriter struct {
+	mu  sync.Mutex // guards Close and Read
+	enc *Encoder
+}
+
+func (w *pooledZipWriter) Write(p []byte) (n int, err error) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	if w.enc == nil {
+		return 0, errors.New("Write after Close")
+	}
+	return w.enc.Write(p)
+}
+
+func (w *pooledZipWriter) Close() error {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	var err error
+	if w.enc != nil {
+		err = w.enc.Close()
+		zipReaderPool.Put(w.enc)
+		w.enc = nil
+	}
+	return err
+}
+
+// ZipCompressor returns a compressor that can be registered with zip libraries.
+// The provided encoder options will be used on all encodes.
+func ZipCompressor(opts ...EOption) func(w io.Writer) (io.WriteCloser, error) {
+	var pool sync.Pool
+	return func(w io.Writer) (io.WriteCloser, error) {
+		enc, ok := pool.Get().(*Encoder)
+		if ok {
+			enc.Reset(w)
+		} else {
+			var err error
+			enc, err = NewWriter(w, opts...)
+			if err != nil {
+				return nil, err
+			}
+		}
+		return &pooledZipWriter{enc: enc}, nil
+	}
+}
+
+// ZipDecompressor returns a decompressor that can be registered with zip libraries.
+// See ZipCompressor for example.
+func ZipDecompressor() func(r io.Reader) io.ReadCloser {
+	return func(r io.Reader) io.ReadCloser {
+		d, err := NewReader(r, WithDecoderConcurrency(1), WithDecoderLowmem(true))
+		if err != nil {
+			panic(err)
+		}
+		return d.IOReadCloser()
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/zstd.go b/vendor/github.com/klauspost/compress/zstd/zstd.go
index 0807719c8..1ba308c8b 100644
--- a/vendor/github.com/klauspost/compress/zstd/zstd.go
+++ b/vendor/github.com/klauspost/compress/zstd/zstd.go
@@ -4,6 +4,8 @@
 package zstd
 
 import (
+	"bytes"
+	"encoding/binary"
 	"errors"
 	"log"
 	"math"
@@ -73,6 +75,10 @@ var (
 	// ErrDecoderClosed will be returned if the Decoder was used after
 	// Close has been called.
 	ErrDecoderClosed = errors.New("decoder used after Close")
+
+	// ErrDecoderNilInput is returned when a nil Reader was provided
+	// and an operation other than Reset/DecodeAll/Close was attempted.
+	ErrDecoderNilInput = errors.New("nil input provided as reader")
 )
 
 func println(a ...interface{}) {
@@ -121,24 +127,20 @@ func matchLen(a, b []byte) int {
 }
 
 func load3232(b []byte, i int32) uint32 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:4]
-	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+	return binary.LittleEndian.Uint32(b[i:])
 }
 
 func load6432(b []byte, i int32) uint64 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:8]
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+	return binary.LittleEndian.Uint64(b[i:])
 }
 
 func load64(b []byte, i int) uint64 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:8]
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+	return binary.LittleEndian.Uint64(b[i:])
+}
+
+type byter interface {
+	Bytes() []byte
+	Len() int
 }
+
+var _ byter = &bytes.Buffer{}
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 56819b36e..2564589bf 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -30,6 +30,8 @@ github.com/fatih/color
 github.com/fsnotify/fsnotify
 # github.com/golang/protobuf v1.3.5
 github.com/golang/protobuf/proto
+# github.com/golang/snappy v0.0.3
+github.com/golang/snappy
 # github.com/hashicorp/hcl v1.0.0
 github.com/hashicorp/hcl
 github.com/hashicorp/hcl/hcl/ast
@@ -47,7 +49,7 @@ github.com/hinshun/vt10x
 # github.com/hokaccha/go-prettyjson v0.0.0-20190818114111-108c894c2c0e
 ## explicit
 github.com/hokaccha/go-prettyjson
-# github.com/honeycombio/libhoney-go v1.15.2
+# github.com/honeycombio/libhoney-go v1.15.3
 ## explicit
 github.com/honeycombio/libhoney-go
 github.com/honeycombio/libhoney-go/transmission
@@ -55,10 +57,9 @@ github.com/honeycombio/libhoney-go/transmission
 github.com/inconshreveable/mousetrap
 # github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51
 github.com/kballard/go-shellquote
-# github.com/klauspost/compress v1.11.4
+# github.com/klauspost/compress v1.12.2
 github.com/klauspost/compress/fse
 github.com/klauspost/compress/huff0
-github.com/klauspost/compress/snappy
 github.com/klauspost/compress/zstd
 github.com/klauspost/compress/zstd/internal/xxhash
 # github.com/kr/pty v1.1.8 => github.com/creack/pty v1.1.7