diff --git a/go.mod b/go.mod index 9fb750c78..8b7317ad4 100644 --- a/go.mod +++ b/go.mod @@ -19,7 +19,7 @@ require ( github.com/hashicorp/terraform-json v0.14.0 github.com/hinshun/vt10x v0.0.0-20180809195222-d55458df857c github.com/hokaccha/go-prettyjson v0.0.0-20190818114111-108c894c2c0e - github.com/honeycombio/libhoney-go v1.16.0 + github.com/honeycombio/libhoney-go v1.17.0 github.com/imdario/mergo v0.3.13 github.com/kyokomi/emoji/v2 v2.2.10 github.com/mitchellh/go-homedir v1.1.0 @@ -54,7 +54,7 @@ require ( github.com/hashicorp/hcl v1.0.0 // indirect github.com/inconshreveable/mousetrap v1.0.0 // indirect github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 // indirect - github.com/klauspost/compress v1.15.7 // indirect + github.com/klauspost/compress v1.15.9 // indirect github.com/kr/pty v1.1.8 // indirect github.com/magiconair/properties v1.8.6 // indirect github.com/mattn/go-colorable v0.1.12 // indirect diff --git a/go.sum b/go.sum index 9715445f3..732bda9c5 100644 --- a/go.sum +++ b/go.sum @@ -214,8 +214,8 @@ github.com/hinshun/vt10x v0.0.0-20180809195222-d55458df857c h1:kp3AxgXgDOmIJFR7b github.com/hinshun/vt10x v0.0.0-20180809195222-d55458df857c/go.mod h1:DqJ97dSdRW1W22yXSB90986pcOyQ7r45iio1KN2ez1A= github.com/hokaccha/go-prettyjson v0.0.0-20190818114111-108c894c2c0e h1:0aewS5NTyxftZHSnFaJmWE5oCCrj4DyEXkAiMa1iZJM= github.com/hokaccha/go-prettyjson v0.0.0-20190818114111-108c894c2c0e/go.mod h1:pFlLw2CfqZiIBOx6BuCeRLCrfxBJipTY0nIOF/VbGcI= -github.com/honeycombio/libhoney-go v1.16.0 h1:kPpqoz6vbOzgp7jC6SR7SkNj7rua7rgxvznI6M3KdHc= -github.com/honeycombio/libhoney-go v1.16.0/go.mod h1:izP4fbREuZ3vqC4HlCAmPrcPT9gxyxejRjGtCYpmBn0= +github.com/honeycombio/libhoney-go v1.17.0 h1:e/s52AcMytDCIobAfThj82mWqSYdPV+djhCs4dKxxnU= +github.com/honeycombio/libhoney-go v1.17.0/go.mod h1:KwbcXkqUbH20x3MpfSt/kdvlog3FFdEnouqYD3XKXLY= github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/imdario/mergo v0.3.12/go.mod h1:jmQim1M+e3UYxmgPu/WyfjB3N3VflVyUjjjwH0dnCYA= @@ -233,8 +233,8 @@ github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:C github.com/kevinburke/ssh_config v0.0.0-20201106050909-4977a11b4351 h1:DowS9hvgyYSX4TO5NpyC606/Z4SxnNYbT+WX27or6Ck= github.com/kevinburke/ssh_config v0.0.0-20201106050909-4977a11b4351/go.mod h1:CT57kijsi8u/K/BOFA39wgDQJ9CxiF4nAY/ojJ6r6mM= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/klauspost/compress v1.15.7 h1:7cgTQxJCU/vy+oP/E3B9RGbQTgbiVzIJWIKOLoAsPok= -github.com/klauspost/compress v1.15.7/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU= +github.com/klauspost/compress v1.15.9 h1:wKRjX6JRtDdrE9qwa4b/Cip7ACOshUI4smpCQanqjSY= +github.com/klauspost/compress v1.15.9/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/fs v0.1.0/go.mod h1:FFnZGqtBN9Gxj7eW1uZ42v5BccTP0vu6NEaFoC2HwRg= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= diff --git a/vendor/github.com/honeycombio/libhoney-go/.gitignore b/vendor/github.com/honeycombio/libhoney-go/.gitignore index daf913b1b..826b5e56c 100644 --- a/vendor/github.com/honeycombio/libhoney-go/.gitignore +++ b/vendor/github.com/honeycombio/libhoney-go/.gitignore @@ -1,3 +1,6 @@ +# Example artifacts +examples/wiki-manual-tracing/*.txt + # Compiled Object files, Static and Dynamic libs (Shared Objects) *.o *.a diff --git a/vendor/github.com/honeycombio/libhoney-go/CHANGELOG.md b/vendor/github.com/honeycombio/libhoney-go/CHANGELOG.md index a1ca7d385..085e421a8 100644 --- a/vendor/github.com/honeycombio/libhoney-go/CHANGELOG.md +++ b/vendor/github.com/honeycombio/libhoney-go/CHANGELOG.md @@ -1,5 +1,17 @@ # libhoney Changelog +## 1.17.0 2022-09-23 + +### Enhancements + +- feat: adds a configurable event batch send timeout (#190) | [@robbkidd](https://github.com/robbkidd) + +### Maintenance + +- maint: add go 1.19 to CI (#189) | [@vreynolds](https://github.com/vreynolds) +- docs: add wiki manual tracing example (#188) | [@vreynolds](https://github.com/vreynolds) +- Bump github.com/klauspost/compress from 1.15.7 to 1.15.9 (#192) | [@robbkidd](https://github.com/robbkidd) + ## 1.16.0 2022-07-13 There were several v2 releases that were unusable because they were incomplete according to Go's semantic versioning strategy. diff --git a/vendor/github.com/honeycombio/libhoney-go/libhoney.go b/vendor/github.com/honeycombio/libhoney-go/libhoney.go index 40176d606..083ea11ef 100644 --- a/vendor/github.com/honeycombio/libhoney-go/libhoney.go +++ b/vendor/github.com/honeycombio/libhoney-go/libhoney.go @@ -35,7 +35,7 @@ const ( defaultAPIHost = "https://api.honeycomb.io/" defaultClassicDataset = "libhoney-go dataset" defaultDataset = "unknown_dataset" - version = "1.16.0" + version = "1.17.0" // DefaultMaxBatchSize how many events to collect in a batch DefaultMaxBatchSize = 50 diff --git a/vendor/github.com/honeycombio/libhoney-go/transmission/transmission.go b/vendor/github.com/honeycombio/libhoney-go/transmission/transmission.go index 667e1069b..572b98b48 100644 --- a/vendor/github.com/honeycombio/libhoney-go/transmission/transmission.go +++ b/vendor/github.com/honeycombio/libhoney-go/transmission/transmission.go @@ -30,21 +30,44 @@ import ( ) const ( - apiMaxBatchSize int = 5000000 // 5MB - apiEventSizeMax int = 100000 // 100KB + // Size limit for a serialized request body sent for a batch. + apiMaxBatchSize int = 5000000 // 5MB + // Size limit for a single serialized event within a batch. + apiEventSizeMax int = 100000 // 100KB maxOverflowBatches int = 10 + // Default start-to-finish timeout for batch send HTTP requests. + defaultSendTimeout = time.Second * 60 ) // Version is the build version, set by libhoney var Version string type Honeycomb struct { - // how many events to collect into a batch before sending + // How many events to collect into a batch before sending. A + // batch could be sent before achieving this item limit if the + // BatchTimeout has elapsed since the last batch send. If set + // to zero, batches will only be sent upon reaching the + // BatchTimeout. It is an error for both this and + // the BatchTimeout to be zero. + // Default: 50 (from Config.MaxBatchSize) MaxBatchSize uint - // how often to send off batches + // How often to send batches. Events queue up into a batch until + // this time has elapsed or the batch item limit is reached + // (MaxBatchSize), then the batch is sent to Honeycomb API. + // If set to zero, batches will only be sent upon reaching the + // MaxBatchSize item limit. It is an error for both this and + // the MaxBatchSize to be zero. + // Default: 100 milliseconds (from Config.SendFrequency) BatchTimeout time.Duration + // The start-to-finish timeout for HTTP requests sending event + // batches to the Honeycomb API. Transmission will retry once + // when receiving a timeout, so total time spent attempting to + // send events could be twice this value. + // Default: 60 seconds. + BatchSendTimeout time.Duration + // how many batches can be inflight simultaneously MaxConcurrentBatches uint @@ -73,6 +96,10 @@ type Honeycomb struct { batchMaker func() muster.Batch responses chan Response + // Transport defines the behavior of the lower layer transport details. + // It is used as the Transport value for the constructed HTTP client that + // sends batches of events. + // Default: http.DefaultTransport Transport http.RoundTripper muster *muster.Client @@ -91,6 +118,9 @@ func (h *Honeycomb) Start() error { if h.Metrics == nil { h.Metrics = &nullMetrics{} } + if h.BatchSendTimeout == 0 { + h.BatchSendTimeout = defaultSendTimeout + } if h.batchMaker == nil { h.batchMaker = func() muster.Batch { return &batchAgg{ @@ -98,7 +128,7 @@ func (h *Honeycomb) Start() error { batches: map[string][]*Event{}, httpClient: &http.Client{ Transport: h.Transport, - Timeout: 60 * time.Second, + Timeout: h.BatchSendTimeout, }, blockOnResponse: h.BlockOnResponse, responses: h.responses, diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md index 1f6b7e9a2..ad5c63a82 100644 --- a/vendor/github.com/klauspost/compress/README.md +++ b/vendor/github.com/klauspost/compress/README.md @@ -17,6 +17,25 @@ This package provides various compression algorithms. # changelog +* July 13, 2022 (v1.15.8) + + * gzip: fix stack exhaustion bug in Reader.Read https://github.com/klauspost/compress/pull/641 + * s2: Add Index header trim/restore https://github.com/klauspost/compress/pull/638 + * zstd: Optimize seqdeq amd64 asm by @greatroar in https://github.com/klauspost/compress/pull/636 + * zstd: Improve decoder memcopy https://github.com/klauspost/compress/pull/637 + * huff0: Pass a single bitReader pointer to asm by @greatroar in https://github.com/klauspost/compress/pull/634 + * zstd: Branchless getBits for amd64 w/o BMI2 by @greatroar in https://github.com/klauspost/compress/pull/640 + * gzhttp: Remove header before writing https://github.com/klauspost/compress/pull/639 + +* June 29, 2022 (v1.15.7) + + * s2: Fix absolute forward seeks https://github.com/klauspost/compress/pull/633 + * zip: Merge upstream https://github.com/klauspost/compress/pull/631 + * zip: Re-add zip64 fix https://github.com/klauspost/compress/pull/624 + * zstd: translate fseDecoder.buildDtable into asm by @WojciechMula in https://github.com/klauspost/compress/pull/598 + * flate: Faster histograms https://github.com/klauspost/compress/pull/620 + * deflate: Use compound hcode https://github.com/klauspost/compress/pull/622 + * June 3, 2022 (v1.15.6) * s2: Improve coding for long, close matches https://github.com/klauspost/compress/pull/613 * s2c: Add Snappy/S2 stream recompression https://github.com/klauspost/compress/pull/611 diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go index 671e630a8..9f3e9f79e 100644 --- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go +++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go @@ -27,10 +27,7 @@ func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) const fallback8BitSize = 800 type decompress4xContext struct { - pbr0 *bitReaderShifted - pbr1 *bitReaderShifted - pbr2 *bitReaderShifted - pbr3 *bitReaderShifted + pbr *[4]bitReaderShifted peekBits uint8 out *byte dstEvery int @@ -89,10 +86,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) { ctx := decompress4xContext{ - pbr0: &br[0], - pbr1: &br[1], - pbr2: &br[2], - pbr3: &br[3], + pbr: &br, peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast() out: &out[0], dstEvery: dstEvery, diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s index 6c65c6e2b..dd1a5aecd 100644 --- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s +++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s @@ -4,45 +4,40 @@ // +build amd64,!appengine,!noasm,gc // func decompress4x_main_loop_amd64(ctx *decompress4xContext) -TEXT ·decompress4x_main_loop_amd64(SB), $8-8 +TEXT ·decompress4x_main_loop_amd64(SB), $0-8 XORQ DX, DX // Preload values MOVQ ctx+0(FP), AX - MOVBQZX 32(AX), SI - MOVQ 40(AX), DI - MOVQ DI, BX - MOVQ 72(AX), CX - MOVQ CX, (SP) - MOVQ 48(AX), R8 - MOVQ 56(AX), R9 - MOVQ (AX), R10 - MOVQ 8(AX), R11 - MOVQ 16(AX), R12 - MOVQ 24(AX), R13 + MOVBQZX 8(AX), DI + MOVQ 16(AX), SI + MOVQ 48(AX), BX + MOVQ 24(AX), R9 + MOVQ 32(AX), R10 + MOVQ (AX), R11 // Main loop main_loop: - MOVQ BX, DI - CMPQ DI, (SP) + MOVQ SI, R8 + CMPQ R8, BX SETGE DL // br0.fillFast32() - MOVQ 32(R10), R14 - MOVBQZX 40(R10), R15 - CMPQ R15, $0x20 + MOVQ 32(R11), R12 + MOVBQZX 40(R11), R13 + CMPQ R13, $0x20 JBE skip_fill0 - MOVQ 24(R10), AX - SUBQ $0x20, R15 + MOVQ 24(R11), AX + SUBQ $0x20, R13 SUBQ $0x04, AX - MOVQ (R10), BP + MOVQ (R11), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(BP*1), BP - MOVQ R15, CX - SHLQ CL, BP - MOVQ AX, 24(R10) - ORQ BP, R14 + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 24(R11) + ORQ R14, R12 // exhausted = exhausted || (br0.off < 4) CMPQ AX, $0x04 @@ -51,57 +46,57 @@ main_loop: skip_fill0: // val0 := br0.peekTopBits(peekBits) - MOVQ R14, BP - MOVQ SI, CX - SHRQ CL, BP + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br0.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R14, BP - SHRQ CL, BP + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 // v1 := table[val1&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (DI) + MOVW AX, (R8) - // update the bitrader reader structure - MOVQ R14, 32(R10) - MOVB R15, 40(R10) - ADDQ R8, DI + // update the bitreader structure + MOVQ R12, 32(R11) + MOVB R13, 40(R11) + ADDQ R9, R8 // br1.fillFast32() - MOVQ 32(R11), R14 - MOVBQZX 40(R11), R15 - CMPQ R15, $0x20 + MOVQ 80(R11), R12 + MOVBQZX 88(R11), R13 + CMPQ R13, $0x20 JBE skip_fill1 - MOVQ 24(R11), AX - SUBQ $0x20, R15 + MOVQ 72(R11), AX + SUBQ $0x20, R13 SUBQ $0x04, AX - MOVQ (R11), BP + MOVQ 48(R11), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(BP*1), BP - MOVQ R15, CX - SHLQ CL, BP - MOVQ AX, 24(R11) - ORQ BP, R14 + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 72(R11) + ORQ R14, R12 // exhausted = exhausted || (br1.off < 4) CMPQ AX, $0x04 @@ -110,57 +105,57 @@ skip_fill0: skip_fill1: // val0 := br1.peekTopBits(peekBits) - MOVQ R14, BP - MOVQ SI, CX - SHRQ CL, BP + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br1.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R14, BP - SHRQ CL, BP + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 // v1 := table[val1&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (DI) + MOVW AX, (R8) - // update the bitrader reader structure - MOVQ R14, 32(R11) - MOVB R15, 40(R11) - ADDQ R8, DI + // update the bitreader structure + MOVQ R12, 80(R11) + MOVB R13, 88(R11) + ADDQ R9, R8 // br2.fillFast32() - MOVQ 32(R12), R14 - MOVBQZX 40(R12), R15 - CMPQ R15, $0x20 + MOVQ 128(R11), R12 + MOVBQZX 136(R11), R13 + CMPQ R13, $0x20 JBE skip_fill2 - MOVQ 24(R12), AX - SUBQ $0x20, R15 + MOVQ 120(R11), AX + SUBQ $0x20, R13 SUBQ $0x04, AX - MOVQ (R12), BP + MOVQ 96(R11), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(BP*1), BP - MOVQ R15, CX - SHLQ CL, BP - MOVQ AX, 24(R12) - ORQ BP, R14 + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 120(R11) + ORQ R14, R12 // exhausted = exhausted || (br2.off < 4) CMPQ AX, $0x04 @@ -169,57 +164,57 @@ skip_fill1: skip_fill2: // val0 := br2.peekTopBits(peekBits) - MOVQ R14, BP - MOVQ SI, CX - SHRQ CL, BP + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br2.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R14, BP - SHRQ CL, BP + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 // v1 := table[val1&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (DI) + MOVW AX, (R8) - // update the bitrader reader structure - MOVQ R14, 32(R12) - MOVB R15, 40(R12) - ADDQ R8, DI + // update the bitreader structure + MOVQ R12, 128(R11) + MOVB R13, 136(R11) + ADDQ R9, R8 // br3.fillFast32() - MOVQ 32(R13), R14 - MOVBQZX 40(R13), R15 - CMPQ R15, $0x20 + MOVQ 176(R11), R12 + MOVBQZX 184(R11), R13 + CMPQ R13, $0x20 JBE skip_fill3 - MOVQ 24(R13), AX - SUBQ $0x20, R15 + MOVQ 168(R11), AX + SUBQ $0x20, R13 SUBQ $0x04, AX - MOVQ (R13), BP + MOVQ 144(R11), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(BP*1), BP - MOVQ R15, CX - SHLQ CL, BP - MOVQ AX, 24(R13) - ORQ BP, R14 + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 168(R11) + ORQ R14, R12 // exhausted = exhausted || (br3.off < 4) CMPQ AX, $0x04 @@ -228,149 +223,142 @@ skip_fill2: skip_fill3: // val0 := br3.peekTopBits(peekBits) - MOVQ R14, BP - MOVQ SI, CX - SHRQ CL, BP + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br3.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R14, BP - SHRQ CL, BP + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 // v1 := table[val1&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (DI) + MOVW AX, (R8) - // update the bitrader reader structure - MOVQ R14, 32(R13) - MOVB R15, 40(R13) - ADDQ $0x02, BX + // update the bitreader structure + MOVQ R12, 176(R11) + MOVB R13, 184(R11) + ADDQ $0x02, SI TESTB DL, DL JZ main_loop MOVQ ctx+0(FP), AX - MOVQ 40(AX), CX - MOVQ BX, DX - SUBQ CX, DX - SHLQ $0x02, DX - MOVQ DX, 64(AX) + SUBQ 16(AX), SI + SHLQ $0x02, SI + MOVQ SI, 40(AX) RET // func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) -TEXT ·decompress4x_8b_main_loop_amd64(SB), $16-8 +TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8 XORQ DX, DX // Preload values MOVQ ctx+0(FP), CX - MOVBQZX 32(CX), BX - MOVQ 40(CX), SI - MOVQ SI, (SP) - MOVQ 72(CX), DX - MOVQ DX, 8(SP) - MOVQ 48(CX), DI - MOVQ 56(CX), R8 - MOVQ (CX), R9 - MOVQ 8(CX), R10 - MOVQ 16(CX), R11 - MOVQ 24(CX), R12 + MOVBQZX 8(CX), DI + MOVQ 16(CX), BX + MOVQ 48(CX), SI + MOVQ 24(CX), R9 + MOVQ 32(CX), R10 + MOVQ (CX), R11 // Main loop main_loop: - MOVQ (SP), SI - CMPQ SI, 8(SP) + MOVQ BX, R8 + CMPQ R8, SI SETGE DL - // br1000.fillFast32() - MOVQ 32(R9), R13 - MOVBQZX 40(R9), R14 - CMPQ R14, $0x20 - JBE skip_fill1000 - MOVQ 24(R9), R15 - SUBQ $0x20, R14 - SUBQ $0x04, R15 - MOVQ (R9), BP + // br0.fillFast32() + MOVQ 32(R11), R12 + MOVBQZX 40(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill0 + MOVQ 24(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ (R11), R15 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R15)(BP*1), BP - MOVQ R14, CX - SHLQ CL, BP - MOVQ R15, 24(R9) - ORQ BP, R13 - - // exhausted = exhausted || (br1000.off < 4) - CMPQ R15, $0x04 + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 24(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br0.off < 4) + CMPQ R14, $0x04 SETLT AL ORB AL, DL -skip_fill1000: +skip_fill0: // val0 := br0.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br0.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v1 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // val2 := br0.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v2 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val3 := br0.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v3 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // these four writes get coalesced @@ -378,88 +366,88 @@ skip_fill1000: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (SI) - - // update the bitreader reader structure - MOVQ R13, 32(R9) - MOVB R14, 40(R9) - ADDQ DI, SI - - // br1001.fillFast32() - MOVQ 32(R10), R13 - MOVBQZX 40(R10), R14 - CMPQ R14, $0x20 - JBE skip_fill1001 - MOVQ 24(R10), R15 - SUBQ $0x20, R14 - SUBQ $0x04, R15 - MOVQ (R10), BP + MOVL AX, (R8) + + // update the bitreader structure + MOVQ R12, 32(R11) + MOVB R13, 40(R11) + ADDQ R9, R8 + + // br1.fillFast32() + MOVQ 80(R11), R12 + MOVBQZX 88(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill1 + MOVQ 72(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ 48(R11), R15 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R15)(BP*1), BP - MOVQ R14, CX - SHLQ CL, BP - MOVQ R15, 24(R10) - ORQ BP, R13 - - // exhausted = exhausted || (br1001.off < 4) - CMPQ R15, $0x04 + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 72(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br1.off < 4) + CMPQ R14, $0x04 SETLT AL ORB AL, DL -skip_fill1001: +skip_fill1: // val0 := br1.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br1.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v1 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // val2 := br1.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v2 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val3 := br1.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v3 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // these four writes get coalesced @@ -467,88 +455,88 @@ skip_fill1001: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (SI) - - // update the bitreader reader structure - MOVQ R13, 32(R10) - MOVB R14, 40(R10) - ADDQ DI, SI - - // br1002.fillFast32() - MOVQ 32(R11), R13 - MOVBQZX 40(R11), R14 - CMPQ R14, $0x20 - JBE skip_fill1002 - MOVQ 24(R11), R15 - SUBQ $0x20, R14 - SUBQ $0x04, R15 - MOVQ (R11), BP + MOVL AX, (R8) + + // update the bitreader structure + MOVQ R12, 80(R11) + MOVB R13, 88(R11) + ADDQ R9, R8 + + // br2.fillFast32() + MOVQ 128(R11), R12 + MOVBQZX 136(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill2 + MOVQ 120(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ 96(R11), R15 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R15)(BP*1), BP - MOVQ R14, CX - SHLQ CL, BP - MOVQ R15, 24(R11) - ORQ BP, R13 - - // exhausted = exhausted || (br1002.off < 4) - CMPQ R15, $0x04 + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 120(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br2.off < 4) + CMPQ R14, $0x04 SETLT AL ORB AL, DL -skip_fill1002: +skip_fill2: // val0 := br2.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br2.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v1 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // val2 := br2.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v2 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val3 := br2.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v3 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // these four writes get coalesced @@ -556,88 +544,88 @@ skip_fill1002: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (SI) - - // update the bitreader reader structure - MOVQ R13, 32(R11) - MOVB R14, 40(R11) - ADDQ DI, SI - - // br1003.fillFast32() - MOVQ 32(R12), R13 - MOVBQZX 40(R12), R14 - CMPQ R14, $0x20 - JBE skip_fill1003 - MOVQ 24(R12), R15 - SUBQ $0x20, R14 - SUBQ $0x04, R15 - MOVQ (R12), BP + MOVL AX, (R8) + + // update the bitreader structure + MOVQ R12, 128(R11) + MOVB R13, 136(R11) + ADDQ R9, R8 + + // br3.fillFast32() + MOVQ 176(R11), R12 + MOVBQZX 184(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill3 + MOVQ 168(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ 144(R11), R15 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R15)(BP*1), BP - MOVQ R14, CX - SHLQ CL, BP - MOVQ R15, 24(R12) - ORQ BP, R13 - - // exhausted = exhausted || (br1003.off < 4) - CMPQ R15, $0x04 + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 168(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br3.off < 4) + CMPQ R14, $0x04 SETLT AL ORB AL, DL -skip_fill1003: +skip_fill3: // val0 := br3.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br3.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v1 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // val2 := br3.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v2 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val3 := br3.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v3 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // these four writes get coalesced @@ -645,20 +633,18 @@ skip_fill1003: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (SI) + MOVL AX, (R8) - // update the bitreader reader structure - MOVQ R13, 32(R12) - MOVB R14, 40(R12) - ADDQ $0x04, (SP) + // update the bitreader structure + MOVQ R12, 176(R11) + MOVB R13, 184(R11) + ADDQ $0x04, BX TESTB DL, DL JZ main_loop MOVQ ctx+0(FP), AX - MOVQ 40(AX), CX - MOVQ (SP), DX - SUBQ CX, DX - SHLQ $0x02, DX - MOVQ DX, 64(AX) + SUBQ 16(AX), BX + SHLQ $0x02, BX + MOVQ BX, 40(AX) RET // func decompress1x_main_loop_amd64(ctx *decompress1xContext) @@ -750,10 +736,8 @@ loop_condition: // Update ctx structure MOVQ ctx+0(FP), AX - MOVQ DX, CX - MOVQ 16(AX), DX - SUBQ DX, CX - MOVQ CX, 40(AX) + SUBQ 16(AX), DX + MOVQ DX, 40(AX) MOVQ (AX), AX MOVQ R9, 24(AX) MOVQ R10, 32(AX) @@ -847,10 +831,8 @@ loop_condition: // Update ctx structure MOVQ ctx+0(FP), AX - MOVQ DX, CX - MOVQ 16(AX), DX - SUBQ DX, CX - MOVQ CX, 40(AX) + SUBQ 16(AX), DX + MOVQ DX, 40(AX) MOVQ (AX), AX MOVQ R9, 24(AX) MOVQ R10, 32(AX) diff --git a/vendor/github.com/klauspost/compress/zstd/bytebuf.go b/vendor/github.com/klauspost/compress/zstd/bytebuf.go index 4493baa75..2ad02070d 100644 --- a/vendor/github.com/klauspost/compress/zstd/bytebuf.go +++ b/vendor/github.com/klauspost/compress/zstd/bytebuf.go @@ -23,7 +23,7 @@ type byteBuffer interface { readByte() (byte, error) // Skip n bytes. - skipN(n int) error + skipN(n int64) error } // in-memory buffer @@ -62,9 +62,12 @@ func (b *byteBuf) readByte() (byte, error) { return r, nil } -func (b *byteBuf) skipN(n int) error { +func (b *byteBuf) skipN(n int64) error { bb := *b - if len(bb) < n { + if n < 0 { + return fmt.Errorf("negative skip (%d) requested", n) + } + if int64(len(bb)) < n { return io.ErrUnexpectedEOF } *b = bb[n:] @@ -120,9 +123,9 @@ func (r *readerWrapper) readByte() (byte, error) { return r.tmp[0], nil } -func (r *readerWrapper) skipN(n int) error { - n2, err := io.CopyN(ioutil.Discard, r.r, int64(n)) - if n2 != int64(n) { +func (r *readerWrapper) skipN(n int64) error { + n2, err := io.CopyN(ioutil.Discard, r.r, n) + if n2 != n { err = io.ErrUnexpectedEOF } return err diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go index 286c8f9d7..d212f4737 100644 --- a/vendor/github.com/klauspost/compress/zstd/decoder.go +++ b/vendor/github.com/klauspost/compress/zstd/decoder.go @@ -348,6 +348,9 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) { frame.history.setDict(&dict) } if frame.WindowSize > d.o.maxWindowSize { + if debugDecoder { + println("window size exceeded:", frame.WindowSize, ">", d.o.maxWindowSize) + } return dst, ErrWindowSizeExceeded } if frame.FrameContentSize != fcsUnknown { diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go index e6b1d01cf..7aaaedb23 100644 --- a/vendor/github.com/klauspost/compress/zstd/encoder.go +++ b/vendor/github.com/klauspost/compress/zstd/encoder.go @@ -528,8 +528,8 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte { // If a non-single block is needed the encoder will reset again. e.encoders <- enc }() - // Use single segments when above minimum window and below 1MB. - single := len(src) < 1<<20 && len(src) > MinWindowSize + // Use single segments when above minimum window and below window size. + single := len(src) <= e.o.windowSize && len(src) > MinWindowSize if e.o.single != nil { single = *e.o.single } diff --git a/vendor/github.com/klauspost/compress/zstd/encoder_options.go b/vendor/github.com/klauspost/compress/zstd/encoder_options.go index 44d8dbd19..a7c5e1aac 100644 --- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go +++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go @@ -283,7 +283,7 @@ func WithNoEntropyCompression(b bool) EOption { // a decoder is allowed to reject a compressed frame which requests a memory size beyond decoder's authorized range. // For broader compatibility, decoders are recommended to support memory sizes of at least 8 MB. // This is only a recommendation, each decoder is free to support higher or lower limits, depending on local limitations. -// If this is not specified, block encodes will automatically choose this based on the input size. +// If this is not specified, block encodes will automatically choose this based on the input size and the window size. // This setting has no effect on streamed encodes. func WithSingleSegment(b bool) EOption { return func(o *encoderOptions) error { diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go index fa0a633f3..9568a4ba3 100644 --- a/vendor/github.com/klauspost/compress/zstd/framedec.go +++ b/vendor/github.com/klauspost/compress/zstd/framedec.go @@ -106,7 +106,7 @@ func (d *frameDec) reset(br byteBuffer) error { } n := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24) println("Skipping frame with", n, "bytes.") - err = br.skipN(int(n)) + err = br.skipN(int64(n)) if err != nil { if debugDecoder { println("Reading discarded frame", err) @@ -231,20 +231,27 @@ func (d *frameDec) reset(br byteBuffer) error { d.crc.Reset() } + if d.WindowSize > d.o.maxWindowSize { + if debugDecoder { + printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize) + } + return ErrWindowSizeExceeded + } + if d.WindowSize == 0 && d.SingleSegment { // We may not need window in this case. d.WindowSize = d.FrameContentSize if d.WindowSize < MinWindowSize { d.WindowSize = MinWindowSize } - } - - if d.WindowSize > uint64(d.o.maxWindowSize) { - if debugDecoder { - printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize) + if d.WindowSize > d.o.maxDecodedSize { + if debugDecoder { + printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize) + } + return ErrDecoderSizeExceeded } - return ErrWindowSizeExceeded } + // The minimum Window_Size is 1 KB. if d.WindowSize < MinWindowSize { if debugDecoder { diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go index e74df436c..c881d28d8 100644 --- a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go +++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go @@ -34,8 +34,8 @@ const ( // buildDtable will build the decoding table. func (s *fseDecoder) buildDtable() error { ctx := buildDtableAsmContext{ - stateTable: (*uint16)(&s.stateTable[0]), - norm: (*int16)(&s.norm[0]), + stateTable: &s.stateTable[0], + norm: &s.norm[0], dt: (*uint64)(&s.dt[0]), } code := buildDtable_asm(s, &ctx) diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go index 847b322ae..7598c1018 100644 --- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go +++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go @@ -55,16 +55,22 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) { if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSize { return false, nil } - useSafe := false - if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc { - useSafe = true - } - if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) { - useSafe = true - } - if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc { - useSafe = true - } + + // FIXME: Using unsafe memory copies leads to rare, random crashes + // with fuzz testing. It is therefore disabled for now. + const useSafe = true + /* + useSafe := false + if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc { + useSafe = true + } + if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) { + useSafe = true + } + if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc { + useSafe = true + } + */ br := s.br diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s index 212c6cac3..27e76774c 100644 --- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s +++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s @@ -52,34 +52,46 @@ sequenceDecs_decode_amd64_fill_byte_by_byte: sequenceDecs_decode_amd64_fill_end: // Update offset - MOVQ R9, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R15 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R15 - ADDQ R15, AX - MOVQ AX, 16(R10) + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_amd64_of_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_amd64_of_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_amd64_of_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_amd64_of_update_zero: + MOVQ AX, 16(R10) // Update match length - MOVQ R8, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R15 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R15 - ADDQ R15, AX - MOVQ AX, 8(R10) + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_amd64_ml_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_amd64_ml_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_amd64_ml_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_amd64_ml_update_zero: + MOVQ AX, 8(R10) // Fill bitreader to have enough for the remaining CMPQ SI, $0x08 @@ -107,19 +119,25 @@ sequenceDecs_decode_amd64_fill_2_byte_by_byte: sequenceDecs_decode_amd64_fill_2_end: // Update literal length - MOVQ DI, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R15 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R15 - ADDQ R15, AX - MOVQ AX, (R10) + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_amd64_ll_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_amd64_ll_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_amd64_ll_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_amd64_ll_update_zero: + MOVQ AX, (R10) // Fill bitreader for state updates MOVQ R14, (SP) @@ -134,18 +152,17 @@ sequenceDecs_decode_amd64_fill_2_end: MOVBQZX DI, R14 SHRQ $0x10, DI MOVWQZX DI, DI - CMPQ R14, $0x00 - JZ sequenceDecs_decode_amd64_llState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, DI -sequenceDecs_decode_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX @@ -155,18 +172,17 @@ sequenceDecs_decode_amd64_llState_updateState_skip_zero: MOVBQZX R8, R14 SHRQ $0x10, R8 MOVWQZX R8, R8 - CMPQ R14, $0x00 - JZ sequenceDecs_decode_amd64_mlState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, R8 -sequenceDecs_decode_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX @@ -176,18 +192,17 @@ sequenceDecs_decode_amd64_mlState_updateState_skip_zero: MOVBQZX R9, R14 SHRQ $0x10, R9 MOVWQZX R9, R9 - CMPQ R14, $0x00 - JZ sequenceDecs_decode_amd64_ofState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, R9 -sequenceDecs_decode_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX @@ -201,7 +216,7 @@ sequenceDecs_decode_amd64_skip_update: MOVQ R12, R13 MOVQ R11, R12 MOVQ CX, R11 - JMP sequenceDecs_decode_amd64_adjust_end + JMP sequenceDecs_decode_amd64_after_adjust sequenceDecs_decode_amd64_adjust_offsetB_1_or_0: CMPQ (R10), $0x00000000 @@ -213,7 +228,7 @@ sequenceDecs_decode_amd64_adjust_offset_maybezero: TESTQ CX, CX JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero MOVQ R11, CX - JMP sequenceDecs_decode_amd64_adjust_end + JMP sequenceDecs_decode_amd64_after_adjust sequenceDecs_decode_amd64_adjust_offset_nonzero: CMPQ CX, $0x01 @@ -250,7 +265,7 @@ sequenceDecs_decode_amd64_adjust_temp_valid: MOVQ AX, R11 MOVQ AX, CX -sequenceDecs_decode_amd64_adjust_end: +sequenceDecs_decode_amd64_after_adjust: MOVQ CX, 16(R10) // Check values @@ -359,49 +374,67 @@ sequenceDecs_decode_56_amd64_fill_byte_by_byte: sequenceDecs_decode_56_amd64_fill_end: // Update offset - MOVQ R9, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R15 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R15 - ADDQ R15, AX - MOVQ AX, 16(R10) + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_56_amd64_of_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_56_amd64_of_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_56_amd64_of_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_56_amd64_of_update_zero: + MOVQ AX, 16(R10) // Update match length - MOVQ R8, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R15 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R15 - ADDQ R15, AX - MOVQ AX, 8(R10) + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_56_amd64_ml_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_56_amd64_ml_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_56_amd64_ml_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_56_amd64_ml_update_zero: + MOVQ AX, 8(R10) // Update literal length - MOVQ DI, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R15 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R15 - ADDQ R15, AX - MOVQ AX, (R10) + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_56_amd64_ll_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_56_amd64_ll_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_56_amd64_ll_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_56_amd64_ll_update_zero: + MOVQ AX, (R10) // Fill bitreader for state updates MOVQ R14, (SP) @@ -416,18 +449,17 @@ sequenceDecs_decode_56_amd64_fill_end: MOVBQZX DI, R14 SHRQ $0x10, DI MOVWQZX DI, DI - CMPQ R14, $0x00 - JZ sequenceDecs_decode_56_amd64_llState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, DI -sequenceDecs_decode_56_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX @@ -437,18 +469,17 @@ sequenceDecs_decode_56_amd64_llState_updateState_skip_zero: MOVBQZX R8, R14 SHRQ $0x10, R8 MOVWQZX R8, R8 - CMPQ R14, $0x00 - JZ sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, R8 -sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX @@ -458,18 +489,17 @@ sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero: MOVBQZX R9, R14 SHRQ $0x10, R9 MOVWQZX R9, R9 - CMPQ R14, $0x00 - JZ sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, R9 -sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX @@ -483,7 +513,7 @@ sequenceDecs_decode_56_amd64_skip_update: MOVQ R12, R13 MOVQ R11, R12 MOVQ CX, R11 - JMP sequenceDecs_decode_56_amd64_adjust_end + JMP sequenceDecs_decode_56_amd64_after_adjust sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0: CMPQ (R10), $0x00000000 @@ -495,7 +525,7 @@ sequenceDecs_decode_56_amd64_adjust_offset_maybezero: TESTQ CX, CX JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero MOVQ R11, CX - JMP sequenceDecs_decode_56_amd64_adjust_end + JMP sequenceDecs_decode_56_amd64_after_adjust sequenceDecs_decode_56_amd64_adjust_offset_nonzero: CMPQ CX, $0x01 @@ -532,7 +562,7 @@ sequenceDecs_decode_56_amd64_adjust_temp_valid: MOVQ AX, R11 MOVQ AX, CX -sequenceDecs_decode_56_amd64_adjust_end: +sequenceDecs_decode_56_amd64_after_adjust: MOVQ CX, 16(R10) // Check values @@ -763,7 +793,7 @@ sequenceDecs_decode_bmi2_skip_update: MOVQ R11, R12 MOVQ R10, R11 MOVQ CX, R10 - JMP sequenceDecs_decode_bmi2_adjust_end + JMP sequenceDecs_decode_bmi2_after_adjust sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0: CMPQ (R9), $0x00000000 @@ -775,7 +805,7 @@ sequenceDecs_decode_bmi2_adjust_offset_maybezero: TESTQ CX, CX JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero MOVQ R10, CX - JMP sequenceDecs_decode_bmi2_adjust_end + JMP sequenceDecs_decode_bmi2_after_adjust sequenceDecs_decode_bmi2_adjust_offset_nonzero: CMPQ CX, $0x01 @@ -812,7 +842,7 @@ sequenceDecs_decode_bmi2_adjust_temp_valid: MOVQ R13, R10 MOVQ R13, CX -sequenceDecs_decode_bmi2_adjust_end: +sequenceDecs_decode_bmi2_after_adjust: MOVQ CX, 16(R9) // Check values @@ -1018,7 +1048,7 @@ sequenceDecs_decode_56_bmi2_skip_update: MOVQ R11, R12 MOVQ R10, R11 MOVQ CX, R10 - JMP sequenceDecs_decode_56_bmi2_adjust_end + JMP sequenceDecs_decode_56_bmi2_after_adjust sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0: CMPQ (R9), $0x00000000 @@ -1030,7 +1060,7 @@ sequenceDecs_decode_56_bmi2_adjust_offset_maybezero: TESTQ CX, CX JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero MOVQ R10, CX - JMP sequenceDecs_decode_56_bmi2_adjust_end + JMP sequenceDecs_decode_56_bmi2_after_adjust sequenceDecs_decode_56_bmi2_adjust_offset_nonzero: CMPQ CX, $0x01 @@ -1067,7 +1097,7 @@ sequenceDecs_decode_56_bmi2_adjust_temp_valid: MOVQ R13, R10 MOVQ R13, CX -sequenceDecs_decode_56_bmi2_adjust_end: +sequenceDecs_decode_56_bmi2_after_adjust: MOVQ CX, 16(R9) // Check values @@ -1181,52 +1211,65 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, R11 - SUBQ DI, R11 - JLS copy_match - MOVQ R9, R14 - SUBQ R11, R14 - CMPQ R13, R11 - JGE copy_all_from_history - XORQ R11, R11 - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(R11*1), R12 - MOVB R12, (BX)(R11*1) - ADDQ $0x01, R11 - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(R11*1), R12 - MOVW R12, (BX)(R11*1) - ADDQ $0x02, R11 - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(R11*1), R12 - MOVL R12, (BX)(R11*1) - ADDQ $0x04, R11 - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(R11*1), R12 - MOVQ R12, (BX)(R11*1) - ADDQ $0x08, R11 - JMP copy_4_test - -copy_4: - MOVUPS (R14)(R11*1), X0 - MOVUPS X0, (BX)(R11*1) - ADDQ $0x10, R11 + MOVQ R12, R11 + SUBQ DI, R11 + JLS copy_match + MOVQ R9, R14 + SUBQ R11, R14 + CMPQ R13, R11 + JG copy_all_from_history + MOVQ R13, R11 + SUBQ $0x10, R11 + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R11 + JAE copy_4_loop + LEAQ 16(R14)(R11*1), R14 + LEAQ 16(BX)(R11*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), R11 + MOVB 2(R14), R12 + MOVW R11, (BX) + MOVB R12, 2(BX) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), R11 + MOVL -4(R14)(R13*1), R12 + MOVL R11, (BX) + MOVL R12, -4(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), R11 + MOVQ -8(R14)(R13*1), R12 + MOVQ R11, (BX) + MOVQ R12, -8(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX -copy_4_test: - CMPQ R11, R13 - JB copy_4 +copy_4_end: ADDQ R13, DI - ADDQ R13, BX ADDQ $0x18, AX INCQ DX CMPQ DX, CX @@ -1234,53 +1277,74 @@ copy_4_test: JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, R11 - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (BX)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, R11 - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (BX)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, R11 - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (BX)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, R11 - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (BX)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (BX)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, R11 - JB copy_5 + MOVQ R11, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(BX)(R15*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_5_end + +copy_5_small: + CMPQ R11, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ R11, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(R11*1), BP + MOVB R15, (BX) + MOVB BP, -1(BX)(R11*1) + ADDQ R11, R14 ADDQ R11, BX + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (BX) + MOVB BP, 2(BX) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(R11*1), BP + MOVL R15, (BX) + MOVL BP, -4(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(R11*1), BP + MOVQ R15, (BX) + MOVQ BP, -8(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + +copy_5_end: ADDQ R11, DI SUBQ R11, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ BX, R11 - SUBQ R12, R11 + MOVQ BX, R11 + SUBQ R12, R11 // ml <= mo CMPQ R13, R12 @@ -1382,45 +1446,67 @@ main_loop: // Copy literals TESTQ R11, R11 JZ check_offset - XORQ R14, R14 - TESTQ $0x00000001, R11 - JZ copy_1_word - MOVB (SI)(R14*1), R15 - MOVB R15, (BX)(R14*1) - ADDQ $0x01, R14 - -copy_1_word: - TESTQ $0x00000002, R11 - JZ copy_1_dword - MOVW (SI)(R14*1), R15 - MOVW R15, (BX)(R14*1) - ADDQ $0x02, R14 - -copy_1_dword: - TESTQ $0x00000004, R11 - JZ copy_1_qword - MOVL (SI)(R14*1), R15 - MOVL R15, (BX)(R14*1) - ADDQ $0x04, R14 - -copy_1_qword: - TESTQ $0x00000008, R11 - JZ copy_1_test - MOVQ (SI)(R14*1), R15 - MOVQ R15, (BX)(R14*1) - ADDQ $0x08, R14 - JMP copy_1_test + MOVQ R11, R14 + SUBQ $0x10, R14 + JB copy_1_small + +copy_1_loop: + MOVUPS (SI), X0 + MOVUPS X0, (BX) + ADDQ $0x10, SI + ADDQ $0x10, BX + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(SI)(R14*1), SI + LEAQ 16(BX)(R14*1), BX + MOVUPS -16(SI), X0 + MOVUPS X0, -16(BX) + JMP copy_1_end + +copy_1_small: + CMPQ R11, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ R11, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (SI), R14 + MOVB -1(SI)(R11*1), R15 + MOVB R14, (BX) + MOVB R15, -1(BX)(R11*1) + ADDQ R11, SI + ADDQ R11, BX + JMP copy_1_end -copy_1: - MOVUPS (SI)(R14*1), X0 - MOVUPS X0, (BX)(R14*1) - ADDQ $0x10, R14 +copy_1_move_3: + MOVW (SI), R14 + MOVB 2(SI), R15 + MOVW R14, (BX) + MOVB R15, 2(BX) + ADDQ R11, SI + ADDQ R11, BX + JMP copy_1_end -copy_1_test: - CMPQ R14, R11 - JB copy_1 +copy_1_move_4through7: + MOVL (SI), R14 + MOVL -4(SI)(R11*1), R15 + MOVL R14, (BX) + MOVL R15, -4(BX)(R11*1) ADDQ R11, SI ADDQ R11, BX + JMP copy_1_end + +copy_1_move_8through16: + MOVQ (SI), R14 + MOVQ -8(SI)(R11*1), R15 + MOVQ R14, (BX) + MOVQ R15, -8(BX)(R11*1) + ADDQ R11, SI + ADDQ R11, BX + +copy_1_end: ADDQ R11, DI // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) @@ -1432,52 +1518,65 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, R11 - SUBQ DI, R11 - JLS copy_match - MOVQ R9, R14 - SUBQ R11, R14 - CMPQ R13, R11 - JGE copy_all_from_history - XORQ R11, R11 - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(R11*1), R12 - MOVB R12, (BX)(R11*1) - ADDQ $0x01, R11 - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(R11*1), R12 - MOVW R12, (BX)(R11*1) - ADDQ $0x02, R11 - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(R11*1), R12 - MOVL R12, (BX)(R11*1) - ADDQ $0x04, R11 - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(R11*1), R12 - MOVQ R12, (BX)(R11*1) - ADDQ $0x08, R11 - JMP copy_4_test - -copy_4: - MOVUPS (R14)(R11*1), X0 - MOVUPS X0, (BX)(R11*1) - ADDQ $0x10, R11 + MOVQ R12, R11 + SUBQ DI, R11 + JLS copy_match + MOVQ R9, R14 + SUBQ R11, R14 + CMPQ R13, R11 + JG copy_all_from_history + MOVQ R13, R11 + SUBQ $0x10, R11 + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R11 + JAE copy_4_loop + LEAQ 16(R14)(R11*1), R14 + LEAQ 16(BX)(R11*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), R11 + MOVB 2(R14), R12 + MOVW R11, (BX) + MOVB R12, 2(BX) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), R11 + MOVL -4(R14)(R13*1), R12 + MOVL R11, (BX) + MOVL R12, -4(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), R11 + MOVQ -8(R14)(R13*1), R12 + MOVQ R11, (BX) + MOVQ R12, -8(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX -copy_4_test: - CMPQ R11, R13 - JB copy_4 +copy_4_end: ADDQ R13, DI - ADDQ R13, BX ADDQ $0x18, AX INCQ DX CMPQ DX, CX @@ -1485,99 +1584,143 @@ copy_4_test: JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, R11 - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (BX)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, R11 - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (BX)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, R11 - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (BX)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, R11 - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (BX)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (BX)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, R11 - JB copy_5 + MOVQ R11, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(BX)(R15*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_5_end + +copy_5_small: + CMPQ R11, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ R11, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(R11*1), BP + MOVB R15, (BX) + MOVB BP, -1(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (BX) + MOVB BP, 2(BX) + ADDQ R11, R14 ADDQ R11, BX + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(R11*1), BP + MOVL R15, (BX) + MOVL BP, -4(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(R11*1), BP + MOVQ R15, (BX) + MOVQ BP, -8(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + +copy_5_end: ADDQ R11, DI SUBQ R11, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ BX, R11 - SUBQ R12, R11 + MOVQ BX, R11 + SUBQ R12, R11 // ml <= mo CMPQ R13, R12 JA copy_overlapping_match // Copy non-overlapping match - ADDQ R13, DI - XORQ R12, R12 - TESTQ $0x00000001, R13 - JZ copy_2_word - MOVB (R11)(R12*1), R14 - MOVB R14, (BX)(R12*1) - ADDQ $0x01, R12 - -copy_2_word: - TESTQ $0x00000002, R13 - JZ copy_2_dword - MOVW (R11)(R12*1), R14 - MOVW R14, (BX)(R12*1) - ADDQ $0x02, R12 - -copy_2_dword: - TESTQ $0x00000004, R13 - JZ copy_2_qword - MOVL (R11)(R12*1), R14 - MOVL R14, (BX)(R12*1) - ADDQ $0x04, R12 - -copy_2_qword: - TESTQ $0x00000008, R13 - JZ copy_2_test - MOVQ (R11)(R12*1), R14 - MOVQ R14, (BX)(R12*1) - ADDQ $0x08, R12 - JMP copy_2_test + ADDQ R13, DI + MOVQ R13, R12 + SUBQ $0x10, R12 + JB copy_2_small -copy_2: - MOVUPS (R11)(R12*1), X0 - MOVUPS X0, (BX)(R12*1) - ADDQ $0x10, R12 +copy_2_loop: + MOVUPS (R11), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R11 + ADDQ $0x10, BX + SUBQ $0x10, R12 + JAE copy_2_loop + LEAQ 16(R11)(R12*1), R11 + LEAQ 16(BX)(R12*1), BX + MOVUPS -16(R11), X0 + MOVUPS X0, -16(BX) + JMP copy_2_end + +copy_2_small: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (R11), R12 + MOVB -1(R11)(R13*1), R14 + MOVB R12, (BX) + MOVB R14, -1(BX)(R13*1) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end -copy_2_test: - CMPQ R12, R13 - JB copy_2 +copy_2_move_3: + MOVW (R11), R12 + MOVB 2(R11), R14 + MOVW R12, (BX) + MOVB R14, 2(BX) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end + +copy_2_move_4through7: + MOVL (R11), R12 + MOVL -4(R11)(R13*1), R14 + MOVL R12, (BX) + MOVL R14, -4(BX)(R13*1) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (R11), R12 + MOVQ -8(R11)(R13*1), R14 + MOVQ R12, (BX) + MOVQ R14, -8(BX)(R13*1) + ADDQ R13, R11 ADDQ R13, BX - JMP handle_loop + +copy_2_end: + JMP handle_loop // Copy overlapping match copy_overlapping_match: @@ -1642,6 +1785,10 @@ TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32 MOVQ 72(AX), DI MOVQ 80(AX), R8 MOVQ 88(AX), R9 + XORQ CX, CX + MOVQ CX, 8(SP) + MOVQ CX, 16(SP) + MOVQ CX, 24(SP) MOVQ 112(AX), R10 MOVQ 128(AX), CX MOVQ CX, 32(SP) @@ -1691,34 +1838,46 @@ sequenceDecs_decodeSync_amd64_fill_byte_by_byte: sequenceDecs_decodeSync_amd64_fill_end: // Update offset - MOVQ R9, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R14 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R14 - ADDQ R14, AX - MOVQ AX, 8(SP) + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_amd64_of_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_amd64_of_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_amd64_of_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_amd64_of_update_zero: + MOVQ AX, 8(SP) // Update match length - MOVQ R8, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R14 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R14 - ADDQ R14, AX - MOVQ AX, 16(SP) + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_amd64_ml_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_amd64_ml_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_amd64_ml_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_amd64_ml_update_zero: + MOVQ AX, 16(SP) // Fill bitreader to have enough for the remaining CMPQ SI, $0x08 @@ -1746,19 +1905,25 @@ sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte: sequenceDecs_decodeSync_amd64_fill_2_end: // Update literal length - MOVQ DI, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R14 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R14 - ADDQ R14, AX - MOVQ AX, 24(SP) + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_amd64_ll_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_amd64_ll_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_amd64_ll_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_amd64_ll_update_zero: + MOVQ AX, 24(SP) // Fill bitreader for state updates MOVQ R13, (SP) @@ -1773,18 +1938,17 @@ sequenceDecs_decodeSync_amd64_fill_2_end: MOVBQZX DI, R13 SHRQ $0x10, DI MOVWQZX DI, DI - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, DI -sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX @@ -1794,18 +1958,17 @@ sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero: MOVBQZX R8, R13 SHRQ $0x10, R8 MOVWQZX R8, R8 - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, R8 -sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX @@ -1815,18 +1978,17 @@ sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero: MOVBQZX R9, R13 SHRQ $0x10, R9 MOVWQZX R9, R9 - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, R9 -sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX @@ -1841,7 +2003,7 @@ sequenceDecs_decodeSync_amd64_skip_update: MOVUPS 144(CX), X0 MOVQ R13, 144(CX) MOVUPS X0, 152(CX) - JMP sequenceDecs_decodeSync_amd64_adjust_end + JMP sequenceDecs_decodeSync_amd64_after_adjust sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0: CMPQ 24(SP), $0x00000000 @@ -1853,7 +2015,7 @@ sequenceDecs_decodeSync_amd64_adjust_offset_maybezero: TESTQ R13, R13 JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero MOVQ 144(CX), R13 - JMP sequenceDecs_decodeSync_amd64_adjust_end + JMP sequenceDecs_decodeSync_amd64_after_adjust sequenceDecs_decodeSync_amd64_adjust_offset_nonzero: MOVQ R13, AX @@ -1862,8 +2024,7 @@ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero: CMPQ R13, $0x03 CMOVQEQ R14, AX CMOVQEQ R15, R14 - LEAQ 144(CX), R15 - ADDQ (R15)(AX*8), R14 + ADDQ 144(CX)(AX*8), R14 JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid MOVQ $0x00000001, R14 @@ -1879,7 +2040,7 @@ sequenceDecs_decodeSync_amd64_adjust_skip: MOVQ R14, 144(CX) MOVQ R14, R13 -sequenceDecs_decodeSync_amd64_adjust_end: +sequenceDecs_decodeSync_amd64_after_adjust: MOVQ R13, 8(SP) // Check values @@ -1934,103 +2095,137 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ CX, AX - SUBQ R12, AX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ AX, R14 - CMPQ R13, AX - JGE copy_all_from_history - XORQ AX, AX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(AX*1), CL - MOVB CL, (R10)(AX*1) - ADDQ $0x01, AX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(AX*1), CX - MOVW CX, (R10)(AX*1) - ADDQ $0x02, AX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(AX*1), CX - MOVL CX, (R10)(AX*1) - ADDQ $0x04, AX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(AX*1), CX - MOVQ CX, (R10)(AX*1) - ADDQ $0x08, AX - JMP copy_4_test - -copy_4: - MOVUPS (R14)(AX*1), X0 - MOVUPS X0, (R10)(AX*1) - ADDQ $0x10, AX + MOVQ CX, AX + SUBQ R12, AX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ AX, R14 + CMPQ R13, AX + JG copy_all_from_history + MOVQ R13, AX + SUBQ $0x10, AX + JB copy_4_small -copy_4_test: - CMPQ AX, R13 - JB copy_4 - ADDQ R13, R12 +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, AX + JAE copy_4_loop + LEAQ 16(R14)(AX*1), R14 + LEAQ 16(R10)(AX*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), AX + MOVB 2(R14), CL + MOVW AX, (R10) + MOVB CL, 2(R10) + ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), AX + MOVL -4(R14)(R13*1), CX + MOVL AX, (R10) + MOVL CX, -4(R10)(R13*1) + ADDQ R13, R14 ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), AX + MOVQ -8(R14)(R13*1), CX + MOVQ AX, (R10) + MOVQ CX, -8(R10)(R13*1) + ADDQ R13, R14 + ADDQ R13, R10 + +copy_4_end: + ADDQ R13, R12 JMP handle_loop JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, AX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R10)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, AX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R10)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, AX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R10)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, AX - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (R10)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R10)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, AX - JB copy_5 + MOVQ AX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R10)(R15*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_5_end + +copy_5_small: + CMPQ AX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ AX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(AX*1), BP + MOVB R15, (R10) + MOVB BP, -1(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R10) + MOVB BP, 2(R10) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(AX*1), BP + MOVL R15, (R10) + MOVL BP, -4(R10)(AX*1) + ADDQ AX, R14 ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(AX*1), BP + MOVQ R15, (R10) + MOVQ BP, -8(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + +copy_5_end: ADDQ AX, R12 SUBQ AX, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R10, AX - SUBQ CX, AX + MOVQ R10, AX + SUBQ CX, AX // ml <= mo CMPQ R13, CX @@ -2142,6 +2337,10 @@ TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32 MOVQ 72(CX), SI MOVQ 80(CX), DI MOVQ 88(CX), R8 + XORQ R9, R9 + MOVQ R9, 8(SP) + MOVQ R9, 16(SP) + MOVQ R9, 24(SP) MOVQ 112(CX), R9 MOVQ 128(CX), R10 MOVQ R10, 32(SP) @@ -2314,7 +2513,7 @@ sequenceDecs_decodeSync_bmi2_skip_update: MOVUPS 144(CX), X0 MOVQ R13, 144(CX) MOVUPS X0, 152(CX) - JMP sequenceDecs_decodeSync_bmi2_adjust_end + JMP sequenceDecs_decodeSync_bmi2_after_adjust sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0: CMPQ 24(SP), $0x00000000 @@ -2326,7 +2525,7 @@ sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero: TESTQ R13, R13 JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero MOVQ 144(CX), R13 - JMP sequenceDecs_decodeSync_bmi2_adjust_end + JMP sequenceDecs_decodeSync_bmi2_after_adjust sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero: MOVQ R13, R12 @@ -2335,8 +2534,7 @@ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero: CMPQ R13, $0x03 CMOVQEQ R14, R12 CMOVQEQ R15, R14 - LEAQ 144(CX), R15 - ADDQ (R15)(R12*8), R14 + ADDQ 144(CX)(R12*8), R14 JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid MOVQ $0x00000001, R14 @@ -2352,7 +2550,7 @@ sequenceDecs_decodeSync_bmi2_adjust_skip: MOVQ R14, 144(CX) MOVQ R14, R13 -sequenceDecs_decodeSync_bmi2_adjust_end: +sequenceDecs_decodeSync_bmi2_after_adjust: MOVQ R13, 8(SP) // Check values @@ -2407,103 +2605,137 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, CX - SUBQ R11, CX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ CX, R14 - CMPQ R13, CX - JGE copy_all_from_history - XORQ CX, CX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(CX*1), R12 - MOVB R12, (R9)(CX*1) - ADDQ $0x01, CX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(CX*1), R12 - MOVW R12, (R9)(CX*1) - ADDQ $0x02, CX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(CX*1), R12 - MOVL R12, (R9)(CX*1) - ADDQ $0x04, CX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(CX*1), R12 - MOVQ R12, (R9)(CX*1) - ADDQ $0x08, CX - JMP copy_4_test - -copy_4: - MOVUPS (R14)(CX*1), X0 - MOVUPS X0, (R9)(CX*1) - ADDQ $0x10, CX + MOVQ R12, CX + SUBQ R11, CX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ CX, R14 + CMPQ R13, CX + JG copy_all_from_history + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, CX + JAE copy_4_loop + LEAQ 16(R14)(CX*1), R14 + LEAQ 16(R9)(CX*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), CX + MOVB 2(R14), R12 + MOVW CX, (R9) + MOVB R12, 2(R9) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), CX + MOVL -4(R14)(R13*1), R12 + MOVL CX, (R9) + MOVL R12, -4(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), CX + MOVQ -8(R14)(R13*1), R12 + MOVQ CX, (R9) + MOVQ R12, -8(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 -copy_4_test: - CMPQ CX, R13 - JB copy_4 +copy_4_end: ADDQ R13, R11 - ADDQ R13, R9 JMP handle_loop JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, CX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R9)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, CX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R9)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, CX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R9)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, CX - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (R9)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R9)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, CX - JB copy_5 + MOVQ CX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R9)(R15*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_5_end + +copy_5_small: + CMPQ CX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ CX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(CX*1), BP + MOVB R15, (R9) + MOVB BP, -1(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R9) + MOVB BP, 2(R9) + ADDQ CX, R14 ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(CX*1), BP + MOVL R15, (R9) + MOVL BP, -4(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(CX*1), BP + MOVQ R15, (R9) + MOVQ BP, -8(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + +copy_5_end: ADDQ CX, R11 SUBQ CX, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R9, CX - SUBQ R12, CX + MOVQ R9, CX + SUBQ R12, CX // ml <= mo CMPQ R13, R12 @@ -2615,6 +2847,10 @@ TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32 MOVQ 72(AX), DI MOVQ 80(AX), R8 MOVQ 88(AX), R9 + XORQ CX, CX + MOVQ CX, 8(SP) + MOVQ CX, 16(SP) + MOVQ CX, 24(SP) MOVQ 112(AX), R10 MOVQ 128(AX), CX MOVQ CX, 32(SP) @@ -2664,34 +2900,46 @@ sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte: sequenceDecs_decodeSync_safe_amd64_fill_end: // Update offset - MOVQ R9, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R14 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R14 - ADDQ R14, AX - MOVQ AX, 8(SP) + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_safe_amd64_of_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_safe_amd64_of_update_zero: + MOVQ AX, 8(SP) // Update match length - MOVQ R8, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R14 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R14 - ADDQ R14, AX - MOVQ AX, 16(SP) + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_safe_amd64_ml_update_zero: + MOVQ AX, 16(SP) // Fill bitreader to have enough for the remaining CMPQ SI, $0x08 @@ -2719,19 +2967,25 @@ sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte: sequenceDecs_decodeSync_safe_amd64_fill_2_end: // Update literal length - MOVQ DI, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R14 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R14 - ADDQ R14, AX - MOVQ AX, 24(SP) + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_safe_amd64_ll_update_zero: + MOVQ AX, 24(SP) // Fill bitreader for state updates MOVQ R13, (SP) @@ -2746,18 +3000,17 @@ sequenceDecs_decodeSync_safe_amd64_fill_2_end: MOVBQZX DI, R13 SHRQ $0x10, DI MOVWQZX DI, DI - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, DI -sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX @@ -2767,18 +3020,17 @@ sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero: MOVBQZX R8, R13 SHRQ $0x10, R8 MOVWQZX R8, R8 - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, R8 -sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX @@ -2788,18 +3040,17 @@ sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero: MOVBQZX R9, R13 SHRQ $0x10, R9 MOVWQZX R9, R9 - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, R9 -sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX @@ -2814,7 +3065,7 @@ sequenceDecs_decodeSync_safe_amd64_skip_update: MOVUPS 144(CX), X0 MOVQ R13, 144(CX) MOVUPS X0, 152(CX) - JMP sequenceDecs_decodeSync_safe_amd64_adjust_end + JMP sequenceDecs_decodeSync_safe_amd64_after_adjust sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0: CMPQ 24(SP), $0x00000000 @@ -2826,7 +3077,7 @@ sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero: TESTQ R13, R13 JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero MOVQ 144(CX), R13 - JMP sequenceDecs_decodeSync_safe_amd64_adjust_end + JMP sequenceDecs_decodeSync_safe_amd64_after_adjust sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero: MOVQ R13, AX @@ -2835,8 +3086,7 @@ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero: CMPQ R13, $0x03 CMOVQEQ R14, AX CMOVQEQ R15, R14 - LEAQ 144(CX), R15 - ADDQ (R15)(AX*8), R14 + ADDQ 144(CX)(AX*8), R14 JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid MOVQ $0x00000001, R14 @@ -2852,7 +3102,7 @@ sequenceDecs_decodeSync_safe_amd64_adjust_skip: MOVQ R14, 144(CX) MOVQ R14, R13 -sequenceDecs_decodeSync_safe_amd64_adjust_end: +sequenceDecs_decodeSync_safe_amd64_after_adjust: MOVQ R13, 8(SP) // Check values @@ -2885,45 +3135,67 @@ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok: // Copy literals TESTQ AX, AX JZ check_offset - XORQ R14, R14 - TESTQ $0x00000001, AX - JZ copy_1_word - MOVB (R11)(R14*1), R15 - MOVB R15, (R10)(R14*1) - ADDQ $0x01, R14 - -copy_1_word: - TESTQ $0x00000002, AX - JZ copy_1_dword - MOVW (R11)(R14*1), R15 - MOVW R15, (R10)(R14*1) - ADDQ $0x02, R14 - -copy_1_dword: - TESTQ $0x00000004, AX - JZ copy_1_qword - MOVL (R11)(R14*1), R15 - MOVL R15, (R10)(R14*1) - ADDQ $0x04, R14 - -copy_1_qword: - TESTQ $0x00000008, AX - JZ copy_1_test - MOVQ (R11)(R14*1), R15 - MOVQ R15, (R10)(R14*1) - ADDQ $0x08, R14 - JMP copy_1_test + MOVQ AX, R14 + SUBQ $0x10, R14 + JB copy_1_small -copy_1: - MOVUPS (R11)(R14*1), X0 - MOVUPS X0, (R10)(R14*1) - ADDQ $0x10, R14 +copy_1_loop: + MOVUPS (R11), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R11 + ADDQ $0x10, R10 + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(R11)(R14*1), R11 + LEAQ 16(R10)(R14*1), R10 + MOVUPS -16(R11), X0 + MOVUPS X0, -16(R10) + JMP copy_1_end + +copy_1_small: + CMPQ AX, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ AX, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (R11), R14 + MOVB -1(R11)(AX*1), R15 + MOVB R14, (R10) + MOVB R15, -1(R10)(AX*1) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end + +copy_1_move_3: + MOVW (R11), R14 + MOVB 2(R11), R15 + MOVW R14, (R10) + MOVB R15, 2(R10) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end + +copy_1_move_4through7: + MOVL (R11), R14 + MOVL -4(R11)(AX*1), R15 + MOVL R14, (R10) + MOVL R15, -4(R10)(AX*1) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end -copy_1_test: - CMPQ R14, AX - JB copy_1 +copy_1_move_8through16: + MOVQ (R11), R14 + MOVQ -8(R11)(AX*1), R15 + MOVQ R14, (R10) + MOVQ R15, -8(R10)(AX*1) ADDQ AX, R11 ADDQ AX, R10 + +copy_1_end: ADDQ AX, R12 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) @@ -2936,149 +3208,206 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ CX, AX - SUBQ R12, AX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ AX, R14 - CMPQ R13, AX - JGE copy_all_from_history - XORQ AX, AX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(AX*1), CL - MOVB CL, (R10)(AX*1) - ADDQ $0x01, AX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(AX*1), CX - MOVW CX, (R10)(AX*1) - ADDQ $0x02, AX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(AX*1), CX - MOVL CX, (R10)(AX*1) - ADDQ $0x04, AX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(AX*1), CX - MOVQ CX, (R10)(AX*1) - ADDQ $0x08, AX - JMP copy_4_test - -copy_4: - MOVUPS (R14)(AX*1), X0 - MOVUPS X0, (R10)(AX*1) - ADDQ $0x10, AX + MOVQ CX, AX + SUBQ R12, AX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ AX, R14 + CMPQ R13, AX + JG copy_all_from_history + MOVQ R13, AX + SUBQ $0x10, AX + JB copy_4_small -copy_4_test: - CMPQ AX, R13 - JB copy_4 - ADDQ R13, R12 +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, AX + JAE copy_4_loop + LEAQ 16(R14)(AX*1), R14 + LEAQ 16(R10)(AX*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), AX + MOVB 2(R14), CL + MOVW AX, (R10) + MOVB CL, 2(R10) + ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), AX + MOVL -4(R14)(R13*1), CX + MOVL AX, (R10) + MOVL CX, -4(R10)(R13*1) + ADDQ R13, R14 ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), AX + MOVQ -8(R14)(R13*1), CX + MOVQ AX, (R10) + MOVQ CX, -8(R10)(R13*1) + ADDQ R13, R14 + ADDQ R13, R10 + +copy_4_end: + ADDQ R13, R12 JMP handle_loop JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, AX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R10)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, AX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R10)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, AX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R10)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, AX - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (R10)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R10)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, AX - JB copy_5 + MOVQ AX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R10)(R15*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_5_end + +copy_5_small: + CMPQ AX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ AX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(AX*1), BP + MOVB R15, (R10) + MOVB BP, -1(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R10) + MOVB BP, 2(R10) + ADDQ AX, R14 ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(AX*1), BP + MOVL R15, (R10) + MOVL BP, -4(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(AX*1), BP + MOVQ R15, (R10) + MOVQ BP, -8(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + +copy_5_end: ADDQ AX, R12 SUBQ AX, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R10, AX - SUBQ CX, AX + MOVQ R10, AX + SUBQ CX, AX // ml <= mo CMPQ R13, CX JA copy_overlapping_match // Copy non-overlapping match - ADDQ R13, R12 - XORQ CX, CX - TESTQ $0x00000001, R13 - JZ copy_2_word - MOVB (AX)(CX*1), R14 - MOVB R14, (R10)(CX*1) - ADDQ $0x01, CX - -copy_2_word: - TESTQ $0x00000002, R13 - JZ copy_2_dword - MOVW (AX)(CX*1), R14 - MOVW R14, (R10)(CX*1) - ADDQ $0x02, CX - -copy_2_dword: - TESTQ $0x00000004, R13 - JZ copy_2_qword - MOVL (AX)(CX*1), R14 - MOVL R14, (R10)(CX*1) - ADDQ $0x04, CX - -copy_2_qword: - TESTQ $0x00000008, R13 - JZ copy_2_test - MOVQ (AX)(CX*1), R14 - MOVQ R14, (R10)(CX*1) - ADDQ $0x08, CX - JMP copy_2_test - -copy_2: - MOVUPS (AX)(CX*1), X0 - MOVUPS X0, (R10)(CX*1) - ADDQ $0x10, CX + ADDQ R13, R12 + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_2_small -copy_2_test: - CMPQ CX, R13 - JB copy_2 +copy_2_loop: + MOVUPS (AX), X0 + MOVUPS X0, (R10) + ADDQ $0x10, AX + ADDQ $0x10, R10 + SUBQ $0x10, CX + JAE copy_2_loop + LEAQ 16(AX)(CX*1), AX + LEAQ 16(R10)(CX*1), R10 + MOVUPS -16(AX), X0 + MOVUPS X0, -16(R10) + JMP copy_2_end + +copy_2_small: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (AX), CL + MOVB -1(AX)(R13*1), R14 + MOVB CL, (R10) + MOVB R14, -1(R10)(R13*1) + ADDQ R13, AX ADDQ R13, R10 - JMP handle_loop + JMP copy_2_end + +copy_2_move_3: + MOVW (AX), CX + MOVB 2(AX), R14 + MOVW CX, (R10) + MOVB R14, 2(R10) + ADDQ R13, AX + ADDQ R13, R10 + JMP copy_2_end + +copy_2_move_4through7: + MOVL (AX), CX + MOVL -4(AX)(R13*1), R14 + MOVL CX, (R10) + MOVL R14, -4(R10)(R13*1) + ADDQ R13, AX + ADDQ R13, R10 + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (AX), CX + MOVQ -8(AX)(R13*1), R14 + MOVQ CX, (R10) + MOVQ R14, -8(R10)(R13*1) + ADDQ R13, AX + ADDQ R13, R10 + +copy_2_end: + JMP handle_loop // Copy overlapping match copy_overlapping_match: @@ -3172,6 +3501,10 @@ TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32 MOVQ 72(CX), SI MOVQ 80(CX), DI MOVQ 88(CX), R8 + XORQ R9, R9 + MOVQ R9, 8(SP) + MOVQ R9, 16(SP) + MOVQ R9, 24(SP) MOVQ 112(CX), R9 MOVQ 128(CX), R10 MOVQ R10, 32(SP) @@ -3344,7 +3677,7 @@ sequenceDecs_decodeSync_safe_bmi2_skip_update: MOVUPS 144(CX), X0 MOVQ R13, 144(CX) MOVUPS X0, 152(CX) - JMP sequenceDecs_decodeSync_safe_bmi2_adjust_end + JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0: CMPQ 24(SP), $0x00000000 @@ -3356,7 +3689,7 @@ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero: TESTQ R13, R13 JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero MOVQ 144(CX), R13 - JMP sequenceDecs_decodeSync_safe_bmi2_adjust_end + JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero: MOVQ R13, R12 @@ -3365,8 +3698,7 @@ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero: CMPQ R13, $0x03 CMOVQEQ R14, R12 CMOVQEQ R15, R14 - LEAQ 144(CX), R15 - ADDQ (R15)(R12*8), R14 + ADDQ 144(CX)(R12*8), R14 JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid MOVQ $0x00000001, R14 @@ -3382,7 +3714,7 @@ sequenceDecs_decodeSync_safe_bmi2_adjust_skip: MOVQ R14, 144(CX) MOVQ R14, R13 -sequenceDecs_decodeSync_safe_bmi2_adjust_end: +sequenceDecs_decodeSync_safe_bmi2_after_adjust: MOVQ R13, 8(SP) // Check values @@ -3415,45 +3747,67 @@ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok: // Copy literals TESTQ CX, CX JZ check_offset - XORQ R14, R14 - TESTQ $0x00000001, CX - JZ copy_1_word - MOVB (R10)(R14*1), R15 - MOVB R15, (R9)(R14*1) - ADDQ $0x01, R14 - -copy_1_word: - TESTQ $0x00000002, CX - JZ copy_1_dword - MOVW (R10)(R14*1), R15 - MOVW R15, (R9)(R14*1) - ADDQ $0x02, R14 - -copy_1_dword: - TESTQ $0x00000004, CX - JZ copy_1_qword - MOVL (R10)(R14*1), R15 - MOVL R15, (R9)(R14*1) - ADDQ $0x04, R14 - -copy_1_qword: - TESTQ $0x00000008, CX - JZ copy_1_test - MOVQ (R10)(R14*1), R15 - MOVQ R15, (R9)(R14*1) - ADDQ $0x08, R14 - JMP copy_1_test + MOVQ CX, R14 + SUBQ $0x10, R14 + JB copy_1_small + +copy_1_loop: + MOVUPS (R10), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R10 + ADDQ $0x10, R9 + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(R10)(R14*1), R10 + LEAQ 16(R9)(R14*1), R9 + MOVUPS -16(R10), X0 + MOVUPS X0, -16(R9) + JMP copy_1_end + +copy_1_small: + CMPQ CX, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ CX, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (R10), R14 + MOVB -1(R10)(CX*1), R15 + MOVB R14, (R9) + MOVB R15, -1(R9)(CX*1) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end -copy_1: - MOVUPS (R10)(R14*1), X0 - MOVUPS X0, (R9)(R14*1) - ADDQ $0x10, R14 +copy_1_move_3: + MOVW (R10), R14 + MOVB 2(R10), R15 + MOVW R14, (R9) + MOVB R15, 2(R9) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end + +copy_1_move_4through7: + MOVL (R10), R14 + MOVL -4(R10)(CX*1), R15 + MOVL R14, (R9) + MOVL R15, -4(R9)(CX*1) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end -copy_1_test: - CMPQ R14, CX - JB copy_1 +copy_1_move_8through16: + MOVQ (R10), R14 + MOVQ -8(R10)(CX*1), R15 + MOVQ R14, (R9) + MOVQ R15, -8(R9)(CX*1) ADDQ CX, R10 ADDQ CX, R9 + +copy_1_end: ADDQ CX, R11 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) @@ -3466,149 +3820,206 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, CX - SUBQ R11, CX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ CX, R14 - CMPQ R13, CX - JGE copy_all_from_history - XORQ CX, CX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(CX*1), R12 - MOVB R12, (R9)(CX*1) - ADDQ $0x01, CX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(CX*1), R12 - MOVW R12, (R9)(CX*1) - ADDQ $0x02, CX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(CX*1), R12 - MOVL R12, (R9)(CX*1) - ADDQ $0x04, CX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(CX*1), R12 - MOVQ R12, (R9)(CX*1) - ADDQ $0x08, CX - JMP copy_4_test - -copy_4: - MOVUPS (R14)(CX*1), X0 - MOVUPS X0, (R9)(CX*1) - ADDQ $0x10, CX + MOVQ R12, CX + SUBQ R11, CX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ CX, R14 + CMPQ R13, CX + JG copy_all_from_history + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, CX + JAE copy_4_loop + LEAQ 16(R14)(CX*1), R14 + LEAQ 16(R9)(CX*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), CX + MOVB 2(R14), R12 + MOVW CX, (R9) + MOVB R12, 2(R9) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), CX + MOVL -4(R14)(R13*1), R12 + MOVL CX, (R9) + MOVL R12, -4(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), CX + MOVQ -8(R14)(R13*1), R12 + MOVQ CX, (R9) + MOVQ R12, -8(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 -copy_4_test: - CMPQ CX, R13 - JB copy_4 +copy_4_end: ADDQ R13, R11 - ADDQ R13, R9 JMP handle_loop JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, CX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R9)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, CX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R9)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, CX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R9)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, CX - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (R9)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R9)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, CX - JB copy_5 + MOVQ CX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R9)(R15*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_5_end + +copy_5_small: + CMPQ CX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ CX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(CX*1), BP + MOVB R15, (R9) + MOVB BP, -1(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R9) + MOVB BP, 2(R9) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(CX*1), BP + MOVL R15, (R9) + MOVL BP, -4(R9)(CX*1) + ADDQ CX, R14 ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(CX*1), BP + MOVQ R15, (R9) + MOVQ BP, -8(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + +copy_5_end: ADDQ CX, R11 SUBQ CX, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R9, CX - SUBQ R12, CX + MOVQ R9, CX + SUBQ R12, CX // ml <= mo CMPQ R13, R12 JA copy_overlapping_match // Copy non-overlapping match - ADDQ R13, R11 - XORQ R12, R12 - TESTQ $0x00000001, R13 - JZ copy_2_word - MOVB (CX)(R12*1), R14 - MOVB R14, (R9)(R12*1) - ADDQ $0x01, R12 - -copy_2_word: - TESTQ $0x00000002, R13 - JZ copy_2_dword - MOVW (CX)(R12*1), R14 - MOVW R14, (R9)(R12*1) - ADDQ $0x02, R12 - -copy_2_dword: - TESTQ $0x00000004, R13 - JZ copy_2_qword - MOVL (CX)(R12*1), R14 - MOVL R14, (R9)(R12*1) - ADDQ $0x04, R12 - -copy_2_qword: - TESTQ $0x00000008, R13 - JZ copy_2_test - MOVQ (CX)(R12*1), R14 - MOVQ R14, (R9)(R12*1) - ADDQ $0x08, R12 - JMP copy_2_test - -copy_2: - MOVUPS (CX)(R12*1), X0 - MOVUPS X0, (R9)(R12*1) - ADDQ $0x10, R12 + ADDQ R13, R11 + MOVQ R13, R12 + SUBQ $0x10, R12 + JB copy_2_small -copy_2_test: - CMPQ R12, R13 - JB copy_2 +copy_2_loop: + MOVUPS (CX), X0 + MOVUPS X0, (R9) + ADDQ $0x10, CX + ADDQ $0x10, R9 + SUBQ $0x10, R12 + JAE copy_2_loop + LEAQ 16(CX)(R12*1), CX + LEAQ 16(R9)(R12*1), R9 + MOVUPS -16(CX), X0 + MOVUPS X0, -16(R9) + JMP copy_2_end + +copy_2_small: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (CX), R12 + MOVB -1(CX)(R13*1), R14 + MOVB R12, (R9) + MOVB R14, -1(R9)(R13*1) + ADDQ R13, CX ADDQ R13, R9 - JMP handle_loop + JMP copy_2_end + +copy_2_move_3: + MOVW (CX), R12 + MOVB 2(CX), R14 + MOVW R12, (R9) + MOVB R14, 2(R9) + ADDQ R13, CX + ADDQ R13, R9 + JMP copy_2_end + +copy_2_move_4through7: + MOVL (CX), R12 + MOVL -4(CX)(R13*1), R14 + MOVL R12, (R9) + MOVL R14, -4(R9)(R13*1) + ADDQ R13, CX + ADDQ R13, R9 + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (CX), R12 + MOVQ -8(CX)(R13*1), R14 + MOVQ R12, (R9) + MOVQ R14, -8(R9)(R13*1) + ADDQ R13, CX + ADDQ R13, R9 + +copy_2_end: + JMP handle_loop // Copy overlapping match copy_overlapping_match: diff --git a/vendor/modules.txt b/vendor/modules.txt index 94d806aa5..6387f2331 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -111,7 +111,7 @@ github.com/hinshun/vt10x # github.com/hokaccha/go-prettyjson v0.0.0-20190818114111-108c894c2c0e ## explicit github.com/hokaccha/go-prettyjson -# github.com/honeycombio/libhoney-go v1.16.0 +# github.com/honeycombio/libhoney-go v1.17.0 ## explicit; go 1.14 github.com/honeycombio/libhoney-go github.com/honeycombio/libhoney-go/transmission @@ -124,7 +124,7 @@ github.com/inconshreveable/mousetrap # github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 ## explicit github.com/kballard/go-shellquote -# github.com/klauspost/compress v1.15.7 +# github.com/klauspost/compress v1.15.9 ## explicit; go 1.16 github.com/klauspost/compress github.com/klauspost/compress/fse