Skip to content

Commit b0dc663

Browse files
committed
huff0: Pass a single bitReader pointer to asm
This makes the context object smaller and frees up three registers, which we can use to replace the limitPtr and bufferOrigin stack variables.

Benchmark results show a tiny win (Go 1.19beta, Core i7-3770K):

name old speed new speed delta
Decompress1XTable/digits-8 347MB/s ± 0% 347MB/s ± 0% ~ (p=0.650 n=8+10)
Decompress1XTable/gettysburg-8 268MB/s ± 0% 268MB/s ± 0% ~ (p=0.400 n=9+9)
Decompress1XTable/twain-8 327MB/s ± 0% 327MB/s ± 1% ~ (p=0.339 n=7+9)
Decompress1XTable/low-ent.10k-8 385MB/s ± 0% 385MB/s ± 1% ~ (p=0.510 n=9+10)
Decompress1XTable/superlow-ent-10k-8 376MB/s ± 0% 376MB/s ± 0% ~ (p=0.712 n=8+10)
Decompress1XTable/crash2-8 17.3MB/s ± 1% 17.3MB/s ± 1% ~ (p=0.926 n=10+10)
Decompress1XTable/endzerobits-8 52.9MB/s ± 1% 52.4MB/s ± 0% -0.94% (p=0.000 n=10+10)
Decompress1XTable/endnonzero-8 11.4MB/s ± 0% 11.4MB/s ± 1% ~ (p=0.343 n=10+10)
Decompress1XTable/case1-8 22.0MB/s ± 0% 22.0MB/s ± 0% ~ (p=0.618 n=9+9)
Decompress1XTable/case2-8 18.1MB/s ± 0% 18.1MB/s ± 0% ~ (p=0.348 n=9+9)
Decompress1XTable/case3-8 19.1MB/s ± 0% 19.1MB/s ± 0% +0.21% (p=0.048 n=10+10)
Decompress1XTable/pngdata.001-8 374MB/s ± 0% 374MB/s ± 0% ~ (p=0.861 n=9+10)
Decompress1XTable/normcount2-8 54.3MB/s ± 1% 54.5MB/s ± 1% ~ (p=0.093 n=10+10)
Decompress1XNoTable/digits/100-8 279MB/s ± 0% 280MB/s ± 0% +0.30% (p=0.003 n=10+9)
Decompress1XNoTable/digits/10000-8 366MB/s ± 0% 365MB/s ± 0% ~ (p=0.113 n=10+9)
Decompress1XNoTable/digits/262143-8 347MB/s ± 0% 347MB/s ± 1% ~ (p=0.739 n=10+10)
Decompress1XNoTable/gettysburg/100-8 278MB/s ± 1% 277MB/s ± 1% ~ (p=0.676 n=10+9)
Decompress1XNoTable/gettysburg/10000-8 363MB/s ± 1% 362MB/s ± 0% -0.50% (p=0.001 n=10+9)
Decompress1XNoTable/gettysburg/262143-8 350MB/s ± 0% 347MB/s ± 0% -0.90% (p=0.000 n=10+8)
Decompress1XNoTable/twain/100-8 268MB/s ± 0% 267MB/s ± 0% ~ (p=0.384 n=9+8)
Decompress1XNoTable/twain/10000-8 363MB/s ± 0% 362MB/s ± 0% -0.32% (p=0.000 n=9+9)
Decompress1XNoTable/twain/262143-8 328MB/s ± 0% 329MB/s ± 0% ~ (p=0.063 n=9+10)
Decompress1XNoTable/low-ent.10k/100-8 180MB/s ± 0% 181MB/s ± 0% ~ (p=0.225 n=10+10)
Decompress1XNoTable/low-ent.10k/10000-8 385MB/s ± 0% 385MB/s ± 0% ~ (p=0.289 n=10+10)
Decompress1XNoTable/low-ent.10k/262143-8 389MB/s ± 1% 389MB/s ± 1% ~ (p=0.971 n=10+10)
Decompress1XNoTable/superlow-ent-10k/262143-8 389MB/s ± 0% 390MB/s ± 0% +0.27% (p=0.017 n=9+10)
Decompress1XNoTable/crash2/100-8 278MB/s ± 0% 279MB/s ± 1% ~ (p=0.163 n=9+10)
Decompress1XNoTable/crash2/10000-8 373MB/s ± 1% 373MB/s ± 0% ~ (p=0.370 n=10+8)
Decompress1XNoTable/crash2/262143-8 375MB/s ± 0% 375MB/s ± 0% ~ (p=0.604 n=9+10)
Decompress1XNoTable/endzerobits/100-8 180MB/s ± 0% 181MB/s ± 0% +0.26% (p=0.005 n=10+9)
Decompress1XNoTable/endzerobits/10000-8 384MB/s ± 0% 385MB/s ± 0% ~ (p=0.914 n=8+10)
Decompress1XNoTable/endzerobits/262143-8 389MB/s ± 0% 390MB/s ± 0% ~ (p=0.739 n=10+10)
Decompress1XNoTable/endnonzero/100-8 180MB/s ± 1% 180MB/s ± 1% ~ (p=0.926 n=10+10)
Decompress1XNoTable/endnonzero/10000-8 384MB/s ± 0% 384MB/s ± 0% ~ (p=0.965 n=10+8)
Decompress1XNoTable/endnonzero/262143-8 390MB/s ± 0% 390MB/s ± 0% ~ (p=0.633 n=8+10)
Decompress1XNoTable/case1/100-8 282MB/s ± 0% 283MB/s ± 0% +0.34% (p=0.005 n=10+10)
Decompress1XNoTable/case1/10000-8 372MB/s ± 0% 373MB/s ± 0% ~ (p=0.113 n=9+9)
Decompress1XNoTable/case1/262143-8 374MB/s ± 0% 374MB/s ± 0% ~ (p=0.448 n=10+10)
Decompress1XNoTable/case2/100-8 274MB/s ± 1% 274MB/s ± 0% ~ (p=0.927 n=10+10)
Decompress1XNoTable/case2/10000-8 376MB/s ± 0% 376MB/s ± 0% ~ (p=0.408 n=10+8)
Decompress1XNoTable/case2/262143-8 376MB/s ± 1% 377MB/s ± 0% ~ (p=1.000 n=10+10)
Decompress1XNoTable/case3/100-8 266MB/s ± 0% 265MB/s ± 0% ~ (p=0.113 n=9+10)
Decompress1XNoTable/case3/10000-8 372MB/s ± 0% 372MB/s ± 0% ~ (p=0.075 n=10+9)
Decompress1XNoTable/case3/262143-8 374MB/s ± 0% 374MB/s ± 0% ~ (p=0.172 n=10+10)
Decompress1XNoTable/pngdata.001/100-8 238MB/s ± 0% 238MB/s ± 0% ~ (p=0.438 n=9+8)
Decompress1XNoTable/pngdata.001/10000-8 384MB/s ± 0% 384MB/s ± 0% ~ (p=0.448 n=10+10)
Decompress1XNoTable/pngdata.001/262143-8 378MB/s ± 0% 378MB/s ± 0% ~ (p=0.836 n=10+10)
Decompress1XNoTable/normcount2/100-8 281MB/s ± 0% 282MB/s ± 1% ~ (p=0.122 n=8+10)
Decompress1XNoTable/normcount2/10000-8 369MB/s ± 1% 369MB/s ± 0% ~ (p=0.912 n=10+10)
Decompress1XNoTable/normcount2/262143-8 370MB/s ± 0% 370MB/s ± 1% ~ (p=0.342 n=10+10)
Decompress4XNoTable/digits/100-8 197MB/s ± 0% 197MB/s ± 1% ~ (p=0.764 n=10+9)
Decompress4XNoTable/digits/10000-8 594MB/s ± 0% 602MB/s ± 1% +1.35% (p=0.000 n=10+10)
Decompress4XNoTable/digits/262143-8 570MB/s ± 1% 578MB/s ± 0% +1.30% (p=0.000 n=10+8)
Decompress4XNoTable/gettysburg/100-8 258MB/s ± 1% 260MB/s ± 0% +0.59% (p=0.001 n=10+10)
Decompress4XNoTable/gettysburg/10000-8 638MB/s ± 0% 641MB/s ± 0% +0.44% (p=0.000 n=9+9)
Decompress4XNoTable/gettysburg/262143-8 573MB/s ± 1% 574MB/s ± 0% ~ (p=0.353 n=10+10)
Decompress4XNoTable/twain/100-8 214MB/s ± 2% 214MB/s ± 2% ~ (p=0.853 n=10+10)
Decompress4XNoTable/twain/10000-8 634MB/s ± 1% 638MB/s ± 0% +0.62% (p=0.000 n=10+10)
Decompress4XNoTable/twain/262143-8 513MB/s ± 1% 517MB/s ± 0% +0.85% (p=0.000 n=10+10)
Decompress4XNoTable/low-ent.10k/100-8 195MB/s ± 0% 194MB/s ± 0% ~ (p=0.130 n=9+9)
Decompress4XNoTable/low-ent.10k/10000-8 635MB/s ± 0% 642MB/s ± 0% +1.19% (p=0.000 n=10+10)
Decompress4XNoTable/low-ent.10k/262143-8 675MB/s ± 0% 685MB/s ± 0% +1.51% (p=0.000 n=10+10)
Decompress4XNoTable/superlow-ent-10k/262143-8 673MB/s ± 1% 684MB/s ± 0% +1.70% (p=0.000 n=10+10)
Decompress4XNoTable/case1/100-8 206MB/s ± 1% 206MB/s ± 0% ~ (p=0.189 n=10+9)
Decompress4XNoTable/case1/10000-8 593MB/s ± 0% 601MB/s ± 0% +1.47% (p=0.000 n=10+10)
Decompress4XNoTable/case1/262143-8 603MB/s ± 0% 613MB/s ± 0% +1.64% (p=0.000 n=10+10)
Decompress4XNoTable/case2/100-8 201MB/s ± 0% 202MB/s ± 1% ~ (p=0.053 n=9+10)
Decompress4XNoTable/case2/10000-8 610MB/s ± 0% 618MB/s ± 0% +1.30% (p=0.000 n=9+10)
Decompress4XNoTable/case2/262143-8 622MB/s ± 1% 634MB/s ± 0% +1.90% (p=0.000 n=9+8)
Decompress4XNoTable/case3/100-8 197MB/s ± 1% 198MB/s ± 0% +0.53% (p=0.001 n=9+10)
Decompress4XNoTable/case3/10000-8 606MB/s ± 0% 615MB/s ± 0% +1.49% (p=0.000 n=8+10)
Decompress4XNoTable/case3/262143-8 613MB/s ± 1% 622MB/s ± 0% +1.48% (p=0.000 n=10+10)
Decompress4XNoTable/pngdata.001/100-8 212MB/s ± 1% 211MB/s ± 0% ~ (p=0.136 n=9+9)
Decompress4XNoTable/pngdata.001/10000-8 645MB/s ± 1% 649MB/s ± 1% +0.65% (p=0.000 n=9+10)
Decompress4XNoTable/pngdata.001/262143-8 640MB/s ± 1% 649MB/s ± 0% +1.44% (p=0.000 n=10+10)
Decompress4XNoTable/normcount2/100-8 260MB/s ± 1% 261MB/s ± 1% ~ (p=0.211 n=10+9)
Decompress4XNoTable/normcount2/10000-8 584MB/s ± 1% 591MB/s ± 0% +1.33% (p=0.000 n=9+9)
Decompress4XNoTable/normcount2/262143-8 588MB/s ± 1% 596MB/s ± 1% +1.39% (p=0.000 n=10+9)
Decompress4XNoTableTableLog8/digits-8 583MB/s ± 1% 592MB/s ± 0% +1.48% (p=0.000 n=10+10)
Decompress4XTable/digits-8 580MB/s ± 0% 588MB/s ± 0% +1.33% (p=0.000 n=8+10)
Decompress4XTable/gettysburg-8 368MB/s ± 1% 370MB/s ± 0% +0.59% (p=0.017 n=10+9)
Decompress4XTable/twain-8 510MB/s ± 0% 515MB/s ± 0% +0.99% (p=0.000 n=9+10)
Decompress4XTable/low-ent.10k-8 657MB/s ± 0% 665MB/s ± 0% +1.24% (p=0.000 n=10+10)
Decompress4XTable/superlow-ent-10k-8 608MB/s ± 0% 617MB/s ± 1% +1.48% (p=0.000 n=8+10)
Decompress4XTable/case1-8 21.1MB/s ± 1% 21.0MB/s ± 2% ~ (p=0.223 n=10+10)
Decompress4XTable/case2-8 17.6MB/s ± 0% 17.6MB/s ± 0% ~ (p=0.199 n=9+10)
Decompress4XTable/case3-8 18.7MB/s ± 0% 18.7MB/s ± 0% ~ (p=0.557 n=10+8)
Decompress4XTable/pngdata.001-8 633MB/s ± 1% 645MB/s ± 0% +1.90% (p=0.000 n=9+10)
Decompress4XTable/normcount2-8 49.9MB/s ± 1% 49.5MB/s ± 1% -0.64% (p=0.002 n=10+10)
[Geo mean] 270MB/s 271MB/s +0.36%
1 parent 51e1025 commit b0dc663

File tree

3 files changed: +382 -422 lines changed

huff0/_generate/gen.go

Lines changed: 46 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -49,49 +49,41 @@ func (d decompress4x) generateProcedure(name string) {
4949
exhausted := GP64()
5050
XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false
5151

52-
limitPtr := AllocLocal(8)
52+
limit := GP64()
5353

5454
bufferOrigin := GP64()
5555
peekBits := GP64()
5656
buffer := GP64()
5757
dstEvery := GP64()
5858
table := GP64()
5959

60-
br0 := GP64()
61-
br1 := GP64()
62-
br2 := GP64()
63-
br3 := GP64()
60+
br := GP64()
6461

6562
Comment("Preload values")
6663
{
6764
ctx := Dereference(Param("ctx"))
6865
Load(ctx.Field("peekBits"), peekBits)
69-
Load(ctx.Field("out"), buffer)
70-
MOVQ(buffer, bufferOrigin)
71-
limit := Load(ctx.Field("limit"), GP64())
72-
MOVQ(limit, limitPtr)
66+
Load(ctx.Field("out"), bufferOrigin)
67+
Load(ctx.Field("limit"), limit)
7368
Load(ctx.Field("dstEvery"), dstEvery)
7469
Load(ctx.Field("tbl"), table)
75-
Load(ctx.Field("pbr0"), br0)
76-
Load(ctx.Field("pbr1"), br1)
77-
Load(ctx.Field("pbr2"), br2)
78-
Load(ctx.Field("pbr3"), br3)
70+
Load(ctx.Field("pbr"), br)
7971
}
8072

8173
Comment("Main loop")
8274
Label("main_loop")
8375

8476
MOVQ(bufferOrigin, buffer)
8577
// Check if we have space
86-
CMPQ(buffer, limitPtr)
78+
CMPQ(buffer, limit)
8779
SETGE(exhausted.As8())
88-
d.decodeTwoValues(0, br0, peekBits, table, buffer, exhausted)
80+
d.decodeTwoValues(0, br, peekBits, table, buffer, exhausted)
8981
ADDQ(dstEvery, buffer)
90-
d.decodeTwoValues(1, br1, peekBits, table, buffer, exhausted)
82+
d.decodeTwoValues(1, br, peekBits, table, buffer, exhausted)
9183
ADDQ(dstEvery, buffer)
92-
d.decodeTwoValues(2, br2, peekBits, table, buffer, exhausted)
84+
d.decodeTwoValues(2, br, peekBits, table, buffer, exhausted)
9385
ADDQ(dstEvery, buffer)
94-
d.decodeTwoValues(3, br3, peekBits, table, buffer, exhausted)
86+
d.decodeTwoValues(3, br, peekBits, table, buffer, exhausted)
9587

9688
ADDQ(U8(2), bufferOrigin) // off += 2
9789

@@ -100,10 +92,9 @@ func (d decompress4x) generateProcedure(name string) {
10092

10193
{
10294
ctx := Dereference(Param("ctx"))
103-
tmp := Load(ctx.Field("out"), GP64())
104-
decoded := GP64()
105-
MOVQ(bufferOrigin, decoded)
106-
SUBQ(tmp, decoded)
95+
ctxout, _ := ctx.Field("out").Resolve()
96+
decoded := bufferOrigin
97+
SUBQ(ctxout.Addr, decoded)
10798
SHLQ(U8(2), decoded) // decoded *= 4
10899

109100
Store(decoded, ctx.Field("decoded"))
@@ -118,6 +109,7 @@ const bitReader_in = 0
118109
const bitReader_off = bitReader_in + 3*8 // {ptr, len, cap}
119110
const bitReader_value = bitReader_off + 8
120111
const bitReader_bitsRead = bitReader_value + 8
112+
const bitReader__size = bitReader_bitsRead + 8
121113

122114
func (d decompress4x) decodeTwoValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) {
123115
brValue, brBitsRead := d.fillFast32(id, 32, br, exhausted)
@@ -157,9 +149,10 @@ func (d decompress4x) decodeTwoValues(id int, br, peekBits, table, buffer, exhau
157149
Comment("out[id * dstEvery + 1] = uint8(v1.entry >> 8)")
158150
MOVW(out.As16(), Mem{Base: buffer})
159151

160-
Comment("update the bitrader reader structure")
161-
MOVQ(brValue, Mem{Base: br, Disp: bitReader_value})
162-
MOVB(brBitsRead.As8(), Mem{Base: br, Disp: bitReader_bitsRead})
152+
Comment("update the bitreader structure")
153+
offset := id * bitReader__size
154+
MOVQ(brValue, Mem{Base: br, Disp: offset + bitReader_value})
155+
MOVB(brBitsRead.As8(), Mem{Base: br, Disp: offset + bitReader_bitsRead})
163156
}
164157

165158
func (d decompress4x) generateProcedure4x8bit(name string) {
@@ -171,49 +164,41 @@ func (d decompress4x) generateProcedure4x8bit(name string) {
171164
exhausted := GP64() // Fixed since we need 8H
172165
XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false
173166

174-
bufferOrigin := AllocLocal(8)
175-
limitPtr := AllocLocal(8)
167+
bufferOrigin := GP64()
168+
limit := GP64()
176169

177170
peekBits := GP64()
178171
buffer := GP64()
179172
dstEvery := GP64()
180173
table := GP64()
181174

182-
br0 := GP64()
183-
br1 := GP64()
184-
br2 := GP64()
185-
br3 := GP64()
175+
br := GP64()
186176

187177
Comment("Preload values")
188178
{
189179
ctx := Dereference(Param("ctx"))
190180
Load(ctx.Field("peekBits"), peekBits)
191-
Load(ctx.Field("out"), buffer)
192-
MOVQ(buffer, bufferOrigin)
193-
limit := Load(ctx.Field("limit"), GP64())
194-
MOVQ(limit, limitPtr)
181+
Load(ctx.Field("out"), bufferOrigin)
182+
Load(ctx.Field("limit"), limit)
195183
Load(ctx.Field("dstEvery"), dstEvery)
196184
Load(ctx.Field("tbl"), table)
197-
Load(ctx.Field("pbr0"), br0)
198-
Load(ctx.Field("pbr1"), br1)
199-
Load(ctx.Field("pbr2"), br2)
200-
Load(ctx.Field("pbr3"), br3)
185+
Load(ctx.Field("pbr"), br)
201186
}
202187

203188
Comment("Main loop")
204189
Label("main_loop")
205190

206191
MOVQ(bufferOrigin, buffer)
207192
// Check if we have space
208-
CMPQ(buffer, limitPtr)
193+
CMPQ(buffer, limit)
209194
SETGE(exhausted.As8())
210-
d.decodeFourValues(0, br0, peekBits, table, buffer, exhausted)
195+
d.decodeFourValues(0, br, peekBits, table, buffer, exhausted)
211196
ADDQ(dstEvery, buffer)
212-
d.decodeFourValues(1, br1, peekBits, table, buffer, exhausted)
197+
d.decodeFourValues(1, br, peekBits, table, buffer, exhausted)
213198
ADDQ(dstEvery, buffer)
214-
d.decodeFourValues(2, br2, peekBits, table, buffer, exhausted)
199+
d.decodeFourValues(2, br, peekBits, table, buffer, exhausted)
215200
ADDQ(dstEvery, buffer)
216-
d.decodeFourValues(3, br3, peekBits, table, buffer, exhausted)
201+
d.decodeFourValues(3, br, peekBits, table, buffer, exhausted)
217202

218203
ADDQ(U8(4), bufferOrigin) // off += 4
219204

@@ -222,10 +207,9 @@ func (d decompress4x) generateProcedure4x8bit(name string) {
222207

223208
{
224209
ctx := Dereference(Param("ctx"))
225-
tmp := Load(ctx.Field("out"), GP64())
226-
decoded := GP64()
227-
MOVQ(bufferOrigin, decoded)
228-
SUBQ(tmp, decoded)
210+
ctxout, _ := ctx.Field("out").Resolve()
211+
decoded := bufferOrigin
212+
SUBQ(ctxout.Addr, decoded)
229213
SHLQ(U8(2), decoded) // decoded *= 4
230214

231215
Store(decoded, ctx.Field("decoded"))
@@ -234,7 +218,7 @@ func (d decompress4x) generateProcedure4x8bit(name string) {
234218
}
235219

236220
func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) {
237-
brValue, brBitsRead := d.fillFast32(id+1000, 32, br, exhausted)
221+
brValue, brBitsRead := d.fillFast32(id, 32, br, exhausted)
238222

239223
decompress := func(valID int, outByte reg.Register) {
240224
CX := reg.CL
@@ -269,9 +253,10 @@ func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, exha
269253
Comment("out[id * dstEvery + 4] = uint8(v3.entry >> 8)")
270254
MOVL(out.As32(), Mem{Base: buffer})
271255

272-
Comment("update the bitreader reader structure")
273-
MOVQ(brValue, Mem{Base: br, Disp: bitReader_value})
274-
MOVB(brBitsRead.As8(), Mem{Base: br, Disp: bitReader_bitsRead})
256+
Comment("update the bitreader structure")
257+
offset := id * bitReader__size
258+
MOVQ(brValue, Mem{Base: br, Disp: offset + bitReader_value})
259+
MOVB(brBitsRead.As8(), Mem{Base: br, Disp: offset + bitReader_bitsRead})
275260
}
276261

277262
func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) (brValue, brBitsRead reg.GPVirtual) {
@@ -281,14 +266,15 @@ func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) (
281266
Commentf("br%d.fillFast32()", id)
282267
brValue = GP64()
283268
brBitsRead = GP64()
284-
MOVQ(Mem{Base: br, Disp: bitReader_value}, brValue)
285-
MOVBQZX(Mem{Base: br, Disp: bitReader_bitsRead}, brBitsRead)
269+
offset := bitReader__size * id
270+
MOVQ(Mem{Base: br, Disp: offset + bitReader_value}, brValue)
271+
MOVBQZX(Mem{Base: br, Disp: offset + bitReader_bitsRead}, brBitsRead)
286272

287273
// We must have at least 2 * max tablelog left
288274
CMPQ(brBitsRead, U8(64-atLeast))
289275
JBE(LabelRef("skip_fill" + strconv.Itoa(id)))
290276
brOffset := GP64()
291-
MOVQ(Mem{Base: br, Disp: bitReader_off}, brOffset)
277+
MOVQ(Mem{Base: br, Disp: offset + bitReader_off}, brOffset)
292278

293279
SUBQ(U8(32), brBitsRead) // b.bitsRead -= 32
294280
SUBQ(U8(4), brOffset) // b.off -= 4
@@ -297,7 +283,7 @@ func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) (
297283
// v = v[:4]
298284
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
299285
tmp := GP64()
300-
MOVQ(Mem{Base: br, Disp: bitReader_in}, tmp)
286+
MOVQ(Mem{Base: br, Disp: offset + bitReader_in}, tmp)
301287

302288
Comment("b.value |= uint64(low) << (b.bitsRead & 63)")
303289
addr := Mem{Base: brOffset, Index: tmp.As64(), Scale: 1}
@@ -306,7 +292,7 @@ func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) (
306292
MOVQ(brBitsRead, CX.As64())
307293
SHLQ(CX, tmp.As64())
308294

309-
MOVQ(brOffset, Mem{Base: br, Disp: bitReader_off})
295+
MOVQ(brOffset, Mem{Base: br, Disp: offset + bitReader_off})
310296
ORQ(tmp.As64(), brValue)
311297
{
312298
Commentf("exhausted = exhausted || (br%d.off < 4)", id)
@@ -474,11 +460,9 @@ func (d decompress1x) generateProcedure(name string) {
474460
{
475461
// calculate decoded as current `out` - initial `out`
476462
ctx := Dereference(Param("ctx"))
477-
decoded := GP64()
478-
tmp := GP64()
479-
MOVQ(buffer, decoded)
480-
Load(ctx.Field("out"), tmp)
481-
SUBQ(tmp, decoded)
463+
ctxout, _ := ctx.Field("out").Resolve()
464+
decoded := buffer
465+
SUBQ(ctxout.Addr, decoded)
482466
Store(decoded, ctx.Field("decoded"))
483467

484468
pbr := Dereference(ctx.Field("pbr"))

huff0/decompress_amd64.go

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,7 @@ func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
2727
const fallback8BitSize = 800
2828

2929
type decompress4xContext struct {
30-
pbr0 *bitReaderShifted
31-
pbr1 *bitReaderShifted
32-
pbr2 *bitReaderShifted
33-
pbr3 *bitReaderShifted
30+
pbr *[4]bitReaderShifted
3431
peekBits uint8
3532
out *byte
3633
dstEvery int
@@ -89,10 +86,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
8986

9087
if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
9188
ctx := decompress4xContext{
92-
pbr0: &br[0],
93-
pbr1: &br[1],
94-
pbr2: &br[2],
95-
pbr3: &br[3],
89+
pbr: &br,
9690
peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
9791
out: &out[0],
9892
dstEvery: dstEvery,

0 commit comments

Comments
 (0)