Skip to content

Commit 6c809ac

Browse files
authored
zstd: Asm decoder tweaks (#537)
* Add non-bmi amd64 tests * Use BEXTRQ for extracting shifted values. * Move 0 check into getBits. * Remove ctx alloc. Sequences only, BMI: ``` benchmark old ns/op new ns/op delta Benchmark_seqdec_decode/n-12286-lits-13914-prev-9869-1990358-3296656-win-4194304.blk-32 91657 91114 -0.59% Benchmark_seqdec_decode/n-12485-lits-6960-prev-976039-2250252-2463561-win-4194304.blk-32 92392 90416 -2.14% Benchmark_seqdec_decode/n-14746-lits-14461-prev-209-8-1379909-win-4194304.blk-32 83022 79745 -3.95% Benchmark_seqdec_decode/n-1525-lits-1498-prev-2009476-797934-2994405-win-4194304.blk-32 9149 8856 -3.20% Benchmark_seqdec_decode/n-3478-lits-3628-prev-895243-2104056-2119329-win-4194304.blk-32 22402 22102 -1.34% Benchmark_seqdec_decode/n-8422-lits-5840-prev-168095-2298675-433830-win-4194304.blk-32 60844 60114 -1.20% Benchmark_seqdec_decode/n-1000-lits-1057-prev-21887-92-217-win-8388608.blk-32 5785 5879 +1.62% Benchmark_seqdec_decode/n-15134-lits-20798-prev-4882976-4884216-4474622-win-8388608.blk-32 118030 115597 -2.06% Benchmark_seqdec_decode/n-2-lits-0-prev-620601-689171-848-win-8388608.blk-32 135 64.3 -52.35% Benchmark_seqdec_decode/n-90-lits-67-prev-19498-23-19710-win-8388608.blk-32 648 589 -9.03% Benchmark_seqdec_decode/n-931-lits-1179-prev-36502-1526-1518-win-8388608.blk-32 5555 5467 -1.58% Benchmark_seqdec_decode/n-2898-lits-4062-prev-335-386-751-win-8388608.blk-32 17896 17605 -1.63% Benchmark_seqdec_decode/n-4056-lits-12419-prev-10792-66-309849-win-8388608.blk-32 27457 27232 -0.82% Benchmark_seqdec_decode/n-8028-lits-4568-prev-917-65-920-win-8388608.blk-32 59341 58158 -1.99% ``` No BMI: ``` benchmark old ns/op new ns/op delta Benchmark_seqdec_decodeNoBMI/n-12286-lits-13914-prev-9869-1990358-3296656-win-4194304.blk-32 114889 113333 -1.35% Benchmark_seqdec_decodeNoBMI/n-12485-lits-6960-prev-976039-2250252-2463561-win-4194304.blk-32 121269 119500 -1.46% Benchmark_seqdec_decodeNoBMI/n-14746-lits-14461-prev-209-8-1379909-win-4194304.blk-32 106986 102585 -4.11% 
Benchmark_seqdec_decodeNoBMI/n-1525-lits-1498-prev-2009476-797934-2994405-win-4194304.blk-32 10910 10304 -5.55% Benchmark_seqdec_decodeNoBMI/n-3478-lits-3628-prev-895243-2104056-2119329-win-4194304.blk-32 25965 24642 -5.10% Benchmark_seqdec_decodeNoBMI/n-8422-lits-5840-prev-168095-2298675-433830-win-4194304.blk-32 80183 77980 -2.75% Benchmark_seqdec_decodeNoBMI/n-1000-lits-1057-prev-21887-92-217-win-8388608.blk-32 6702 6369 -4.97% Benchmark_seqdec_decodeNoBMI/n-15134-lits-20798-prev-4882976-4884216-4474622-win-8388608.blk-32 151867 148752 -2.05% Benchmark_seqdec_decodeNoBMI/n-2-lits-0-prev-620601-689171-848-win-8388608.blk-32 139 46.8 -66.31% Benchmark_seqdec_decodeNoBMI/n-90-lits-67-prev-19498-23-19710-win-8388608.blk-32 744 609 -18.13% Benchmark_seqdec_decodeNoBMI/n-931-lits-1179-prev-36502-1526-1518-win-8388608.blk-32 6570 6083 -7.41% Benchmark_seqdec_decodeNoBMI/n-2898-lits-4062-prev-335-386-751-win-8388608.blk-32 20448 19955 -2.41% Benchmark_seqdec_decodeNoBMI/n-4056-lits-12419-prev-10792-66-309849-win-8388608.blk-32 34177 32790 -4.06% Benchmark_seqdec_decodeNoBMI/n-8028-lits-4568-prev-917-65-920-win-8388608.blk-32 77864 75628 -2.87% ```
1 parent 2d457e5 commit 6c809ac

File tree

6 files changed

+227
-63
lines changed

6 files changed

+227
-63
lines changed

internal/cpuinfo/cpuinfo.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,16 @@ func HasBMI2() bool {
1515
return hasBMI2
1616
}
1717

18+
// DisableBMI2 will disable BMI2, for testing purposes.
19+
// Call the returned function to restore the previous state.
20+
func DisableBMI2() func() {
21+
old := hasBMI2
22+
hasBMI2 = false
23+
return func() {
24+
hasBMI2 = old
25+
}
26+
}
27+
1828
// HasBMI checks whether an x86 CPU supports both BMI1 and BMI2 extensions.
1929
func HasBMI() bool {
2030
return HasBMI1() && HasBMI2()

zstd/_generate/gen.go

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,16 @@ func (o options) genDecodeSeqAsm(name string) {
165165
}
166166

167167
R14 := GP64()
168-
MOVQ(ofState, R14) // copy ofState, its current value is needed below
168+
if o.bmi2 {
169+
tmp := GP64()
170+
MOVQ(U32(8|(8<<8)), tmp)
171+
BEXTRQ(tmp, ofState, R14)
172+
} else {
173+
MOVQ(ofState, R14) // copy ofState, its current value is needed below
174+
SHRQ(U8(8), R14) // moB (from the ofState before its update)
175+
MOVBQZX(R14.As8(), R14)
176+
}
177+
169178
// Reload ctx
170179
ctx := Dereference(Param("ctx"))
171180
iteration, err := ctx.Field("iteration").Resolve()
@@ -188,8 +197,6 @@ func (o options) genDecodeSeqAsm(name string) {
188197
Label(name + "_skip_update")
189198

190199
// mo = s.adjustOffset(mo, ll, moB)
191-
SHRQ(U8(8), R14) // moB (from the ofState before its update)
192-
MOVBQZX(R14.As8(), R14)
193200

194201
Comment("Adjust offset")
195202

@@ -373,27 +380,27 @@ func (o options) updateState(name string, state, brValue, brBitsRead reg.GPVirtu
373380
})
374381

375382
DX := GP64()
376-
MOVQ(state, DX) // TODO: maybe use BEXTR?
377-
SHRQ(U8(16), DX)
378-
MOVWQZX(DX.As16(), DX)
379-
380-
if !o.bmi2 {
381-
// TODO: Probably reasonable to skip if AX==0
382-
CMPQ(AX, U8(0))
383-
JZ(LabelRef(name + "_skip"))
383+
if o.bmi2 {
384+
tmp := GP64()
385+
MOVQ(U32(16|(16<<8)), tmp)
386+
BEXTRQ(tmp, state, DX)
387+
} else {
388+
MOVQ(state, DX)
389+
SHRQ(U8(16), DX)
390+
MOVWQZX(DX.As16(), DX)
384391
}
385392

386393
{
387-
lowBits := o.getBits(name+"_getBits", AX, brValue, brBitsRead)
394+
lowBits := o.getBits(name+"_getBits", AX, brValue, brBitsRead, LabelRef(name+"_skip_zero"))
388395
// Check if below tablelog
389396
assert(func(ok LabelRef) {
390397
CMPQ(lowBits, U32(512))
391398
JB(ok)
392399
})
393400
ADDQ(lowBits, DX)
401+
Label(name + "_skip_zero")
394402
}
395403

396-
Label(name + "_skip")
397404
// Load table pointer
398405
tablePtr := GP64()
399406
Comment("Load ctx." + table)
@@ -413,7 +420,9 @@ func (o options) updateState(name string, state, brValue, brBitsRead reg.GPVirtu
413420
MOVQ(Mem{Base: tablePtr, Index: DX, Scale: 8}, state)
414421
}
415422

416-
func (o options) getBits(name string, nBits, brValue, brBitsRead reg.GPVirtual) reg.GPVirtual {
423+
// getBits will return nbits bits from brValue.
424+
// If nBits == 0 it *may* jump to jmpZero; if it does not jump, 0 is returned.
425+
func (o options) getBits(name string, nBits, brValue, brBitsRead reg.GPVirtual, jmpZero LabelRef) reg.GPVirtual {
417426
BX := GP64()
418427
CX := reg.CL
419428
if o.bmi2 {
@@ -423,15 +432,15 @@ func (o options) getBits(name string, nBits, brValue, brBitsRead reg.GPVirtual)
423432
ROLQ(CX, BX)
424433
BZHIQ(nBits, BX, BX)
425434
} else {
435+
CMPQ(nBits, U8(0))
436+
JZ(jmpZero)
426437
MOVQ(brBitsRead, CX.As64())
427438
ADDQ(nBits, brBitsRead)
428439
MOVQ(brValue, BX)
429440
SHLQ(CX, BX)
430441
MOVQ(nBits, CX.As64())
431442
NEGQ(CX.As64())
432443
SHRQ(CX, BX)
433-
TESTQ(nBits, nBits)
434-
CMOVQEQ(nBits, BX)
435444
}
436445
return BX
437446
}

zstd/seqdec_amd64.go

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -30,23 +30,13 @@ const errorMatchLenTooBig = 2
3030
// sequenceDecs_decode_amd64 implements the main loop of sequenceDecs in x86 asm.
3131
//
3232
// Please refer to seqdec_generic.go for the reference implementation.
33+
//go:noescape
3334
func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
3435

3536
// sequenceDecs_decode_bmi2 implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
37+
//go:noescape
3638
func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
3739

38-
type sequenceDecs_decode_function = func(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
39-
40-
var sequenceDecs_decode sequenceDecs_decode_function
41-
42-
func init() {
43-
if cpuinfo.HasBMI2() {
44-
sequenceDecs_decode = sequenceDecs_decode_bmi2
45-
} else {
46-
sequenceDecs_decode = sequenceDecs_decode_amd64
47-
}
48-
}
49-
5040
// decode sequences from the stream without the provided history.
5141
func (s *sequenceDecs) decode(seqs []seqVals) error {
5242
br := s.br
@@ -70,7 +60,12 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
7060

7161
s.seqSize = 0
7262

73-
errCode := sequenceDecs_decode(s, br, &ctx)
63+
var errCode int
64+
if cpuinfo.HasBMI2() {
65+
errCode = sequenceDecs_decode_bmi2(s, br, &ctx)
66+
} else {
67+
errCode = sequenceDecs_decode_amd64(s, br, &ctx)
68+
}
7469
if errCode != 0 {
7570
i := len(seqs) - ctx.iteration
7671
switch errCode {

zstd/seqdec_amd64.s

Lines changed: 25 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -142,30 +142,30 @@ sequenceDecs_decode_amd64_fill_3_byte_by_byte:
142142
JMP sequenceDecs_decode_amd64_fill_3_byte_by_byte
143143

144144
sequenceDecs_decode_amd64_fill_3_end:
145-
MOVQ R11, (SP)
146-
MOVQ R9, AX
147-
MOVQ ctx+16(FP), CX
148-
CMPQ 96(CX), $0x00
149-
JZ sequenceDecs_decode_amd64_skip_update
145+
MOVQ R11, (SP)
146+
MOVQ R9, AX
147+
SHRQ $0x08, AX
148+
MOVBQZX AL, AX
149+
MOVQ ctx+16(FP), CX
150+
CMPQ 96(CX), $0x00
151+
JZ sequenceDecs_decode_amd64_skip_update
150152

151153
// Update Literal Length State
152154
MOVBQZX DI, R11
153155
SHRQ $0x10, DI
154156
MOVWQZX DI, DI
155157
CMPQ R11, $0x00
156-
JZ sequenceDecs_decode_amd64_llState_updateState_skip
158+
JZ sequenceDecs_decode_amd64_llState_updateState_skip_zero
157159
MOVQ BX, CX
158160
ADDQ R11, BX
159161
MOVQ DX, R12
160162
SHLQ CL, R12
161163
MOVQ R11, CX
162164
NEGQ CX
163165
SHRQ CL, R12
164-
TESTQ R11, R11
165-
CMOVQEQ R11, R12
166166
ADDQ R12, DI
167167

168-
sequenceDecs_decode_amd64_llState_updateState_skip:
168+
sequenceDecs_decode_amd64_llState_updateState_skip_zero:
169169
// Load ctx.llTable
170170
MOVQ ctx+16(FP), CX
171171
MOVQ (CX), CX
@@ -176,19 +176,17 @@ sequenceDecs_decode_amd64_llState_updateState_skip:
176176
SHRQ $0x10, R8
177177
MOVWQZX R8, R8
178178
CMPQ R11, $0x00
179-
JZ sequenceDecs_decode_amd64_mlState_updateState_skip
179+
JZ sequenceDecs_decode_amd64_mlState_updateState_skip_zero
180180
MOVQ BX, CX
181181
ADDQ R11, BX
182182
MOVQ DX, R12
183183
SHLQ CL, R12
184184
MOVQ R11, CX
185185
NEGQ CX
186186
SHRQ CL, R12
187-
TESTQ R11, R11
188-
CMOVQEQ R11, R12
189187
ADDQ R12, R8
190188

191-
sequenceDecs_decode_amd64_mlState_updateState_skip:
189+
sequenceDecs_decode_amd64_mlState_updateState_skip_zero:
192190
// Load ctx.mlTable
193191
MOVQ ctx+16(FP), CX
194192
MOVQ 24(CX), CX
@@ -199,28 +197,23 @@ sequenceDecs_decode_amd64_mlState_updateState_skip:
199197
SHRQ $0x10, R9
200198
MOVWQZX R9, R9
201199
CMPQ R11, $0x00
202-
JZ sequenceDecs_decode_amd64_ofState_updateState_skip
200+
JZ sequenceDecs_decode_amd64_ofState_updateState_skip_zero
203201
MOVQ BX, CX
204202
ADDQ R11, BX
205203
MOVQ DX, R12
206204
SHLQ CL, R12
207205
MOVQ R11, CX
208206
NEGQ CX
209207
SHRQ CL, R12
210-
TESTQ R11, R11
211-
CMOVQEQ R11, R12
212208
ADDQ R12, R9
213209

214-
sequenceDecs_decode_amd64_ofState_updateState_skip:
210+
sequenceDecs_decode_amd64_ofState_updateState_skip_zero:
215211
// Load ctx.ofTable
216212
MOVQ ctx+16(FP), CX
217213
MOVQ 48(CX), CX
218214
MOVQ (CX)(R9*8), R9
219215

220216
sequenceDecs_decode_amd64_skip_update:
221-
SHRQ $0x08, AX
222-
MOVBQZX AL, AX
223-
224217
// Adjust offset
225218
MOVQ s+0(FP), CX
226219
MOVQ 16(R10), R11
@@ -444,16 +437,17 @@ sequenceDecs_decode_bmi2_fill_3_byte_by_byte:
444437
JMP sequenceDecs_decode_bmi2_fill_3_byte_by_byte
445438

446439
sequenceDecs_decode_bmi2_fill_3_end:
447-
MOVQ R10, (SP)
448-
MOVQ R8, R10
449-
MOVQ ctx+16(FP), CX
450-
CMPQ 96(CX), $0x00
451-
JZ sequenceDecs_decode_bmi2_skip_update
440+
MOVQ R10, (SP)
441+
MOVQ $0x00000808, CX
442+
BEXTRQ CX, R8, R10
443+
MOVQ ctx+16(FP), CX
444+
CMPQ 96(CX), $0x00
445+
JZ sequenceDecs_decode_bmi2_skip_update
452446

453447
// Update Literal Length State
454448
MOVBQZX SI, R11
455-
SHRQ $0x10, SI
456-
MOVWQZX SI, SI
449+
MOVQ $0x00001010, CX
450+
BEXTRQ CX, SI, SI
457451
LEAQ (DX)(R11*1), CX
458452
MOVQ AX, R12
459453
MOVQ CX, DX
@@ -468,8 +462,8 @@ sequenceDecs_decode_bmi2_fill_3_end:
468462

469463
// Update Match Length State
470464
MOVBQZX DI, R11
471-
SHRQ $0x10, DI
472-
MOVWQZX DI, DI
465+
MOVQ $0x00001010, CX
466+
BEXTRQ CX, DI, DI
473467
LEAQ (DX)(R11*1), CX
474468
MOVQ AX, R12
475469
MOVQ CX, DX
@@ -484,8 +478,8 @@ sequenceDecs_decode_bmi2_fill_3_end:
484478

485479
// Update Offset State
486480
MOVBQZX R8, R11
487-
SHRQ $0x10, R8
488-
MOVWQZX R8, R8
481+
MOVQ $0x00001010, CX
482+
BEXTRQ CX, R8, R8
489483
LEAQ (DX)(R11*1), CX
490484
MOVQ AX, R12
491485
MOVQ CX, DX
@@ -499,9 +493,6 @@ sequenceDecs_decode_bmi2_fill_3_end:
499493
MOVQ (CX)(R8*8), R8
500494

501495
sequenceDecs_decode_bmi2_skip_update:
502-
SHRQ $0x08, R10
503-
MOVBQZX R10, R10
504-
505496
// Adjust offset
506497
MOVQ s+0(FP), CX
507498
MOVQ 16(R9), R11

0 commit comments

Comments
 (0)