Skip to content

Commit f59d5b1

Browse files
authored
huff0: Improve 4X decompression speed 5-10% (#437)
Improve Huffman 4X decompression speed for tablelog <= 8.

```
λ benchcmp before.txt after.txt
benchmark                                              old ns/op     new ns/op     delta
BenchmarkDecompress4XNoTable/digits-32                 167490        158439        -5.40%
BenchmarkDecompress4XNoTable/gettysburg-32             2762          2782          +0.72%
BenchmarkDecompress4XNoTable/twain-32                  578974        584448        +0.95%
BenchmarkDecompress4XNoTable/low-ent.10k-32            57714         54112         -6.24%
BenchmarkDecompress4XNoTable/superlow-ent-10k-32       15440         14349         -7.07%
BenchmarkDecompress4XNoTable/case1-32                  232           215           -7.28%
BenchmarkDecompress4XNoTable/case2-32                  181           172           -4.97%
BenchmarkDecompress4XNoTable/case3-32                  187           178           -5.28%
BenchmarkDecompress4XNoTable/pngdata.001-32            78498         79716         +1.55%
BenchmarkDecompress4XNoTable/normcount2-32             299           270           -9.60%
BenchmarkDecompress4XTable/digits-32                   167728        158709        -5.38%
BenchmarkDecompress4XTable/gettysburg-32               3993          3956          -0.93%
BenchmarkDecompress4XTable/twain-32                    586985        584482        -0.43%
BenchmarkDecompress4XTable/low-ent.10k-32              58317         54652         -6.28%
BenchmarkDecompress4XTable/superlow-ent-10k-32         15895         14903         -6.24%
BenchmarkDecompress4XTable/case1-32                    2030          1996          -1.67%
BenchmarkDecompress4XTable/case2-32                    1986          1956          -1.51%
BenchmarkDecompress4XTable/case3-32                    2004          1980          -1.20%
BenchmarkDecompress4XTable/pngdata.001-32              81922         81627         -0.36%
BenchmarkDecompress4XTable/normcount2-32               1351          1339          -0.89%
```
1 parent dff5f6b commit f59d5b1

File tree

2 files changed

+236
-172
lines changed

2 files changed

+236
-172
lines changed

huff0/decompress.go

Lines changed: 113 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ type dEntrySingle struct {
2020

2121
// double-symbols decoding
2222
type dEntryDouble struct {
23-
seq uint16
23+
seq [4]byte
2424
nBits uint8
2525
len uint8
2626
}
@@ -914,7 +914,7 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
914914
out := dst
915915
dstEvery := (dstSize + 3) / 4
916916

917-
shift := (8 - d.actualTableLog) & 7
917+
shift := (56 + (8 - d.actualTableLog)) & 63
918918

919919
const tlSize = 1 << 8
920920
single := d.dt.single[:tlSize]
@@ -935,79 +935,91 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
935935
// Interleave 2 decodes.
936936
const stream = 0
937937
const stream2 = 1
938-
br[stream].fillFast()
939-
br[stream2].fillFast()
940-
941-
v := single[br[stream].peekByteFast()>>shift].entry
938+
br1 := &br[stream]
939+
br2 := &br[stream2]
940+
br1.fillFast()
941+
br2.fillFast()
942+
943+
v := single[uint8(br1.value>>shift)].entry
944+
v2 := single[uint8(br2.value>>shift)].entry
945+
br1.bitsRead += uint8(v)
946+
br1.value <<= v & 63
947+
br2.bitsRead += uint8(v2)
948+
br2.value <<= v2 & 63
942949
buf[off+bufoff*stream] = uint8(v >> 8)
943-
br[stream].advance(uint8(v))
944-
945-
v2 := single[br[stream2].peekByteFast()>>shift].entry
946950
buf[off+bufoff*stream2] = uint8(v2 >> 8)
947-
br[stream2].advance(uint8(v2))
948951

949-
v = single[br[stream].peekByteFast()>>shift].entry
952+
v = single[uint8(br1.value>>shift)].entry
953+
v2 = single[uint8(br2.value>>shift)].entry
954+
br1.bitsRead += uint8(v)
955+
br1.value <<= v & 63
956+
br2.bitsRead += uint8(v2)
957+
br2.value <<= v2 & 63
950958
buf[off+bufoff*stream+1] = uint8(v >> 8)
951-
br[stream].advance(uint8(v))
952-
953-
v2 = single[br[stream2].peekByteFast()>>shift].entry
954959
buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
955-
br[stream2].advance(uint8(v2))
956960

957-
v = single[br[stream].peekByteFast()>>shift].entry
961+
v = single[uint8(br1.value>>shift)].entry
962+
v2 = single[uint8(br2.value>>shift)].entry
963+
br1.bitsRead += uint8(v)
964+
br1.value <<= v & 63
965+
br2.bitsRead += uint8(v2)
966+
br2.value <<= v2 & 63
958967
buf[off+bufoff*stream+2] = uint8(v >> 8)
959-
br[stream].advance(uint8(v))
960-
961-
v2 = single[br[stream2].peekByteFast()>>shift].entry
962968
buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
963-
br[stream2].advance(uint8(v2))
964969

965-
v = single[br[stream].peekByteFast()>>shift].entry
966-
buf[off+bufoff*stream+3] = uint8(v >> 8)
967-
br[stream].advance(uint8(v))
968-
969-
v2 = single[br[stream2].peekByteFast()>>shift].entry
970+
v = single[uint8(br1.value>>shift)].entry
971+
v2 = single[uint8(br2.value>>shift)].entry
972+
br1.bitsRead += uint8(v)
973+
br1.value <<= v & 63
974+
br2.bitsRead += uint8(v2)
975+
br2.value <<= v2 & 63
970976
buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
971-
br[stream2].advance(uint8(v2))
977+
buf[off+bufoff*stream+3] = uint8(v >> 8)
972978
}
973979

974980
{
975981
const stream = 2
976982
const stream2 = 3
977-
br[stream].fillFast()
978-
br[stream2].fillFast()
979-
980-
v := single[br[stream].peekByteFast()>>shift].entry
983+
br1 := &br[stream]
984+
br2 := &br[stream2]
985+
br1.fillFast()
986+
br2.fillFast()
987+
988+
v := single[uint8(br1.value>>shift)].entry
989+
v2 := single[uint8(br2.value>>shift)].entry
990+
br1.bitsRead += uint8(v)
991+
br1.value <<= v & 63
992+
br2.bitsRead += uint8(v2)
993+
br2.value <<= v2 & 63
981994
buf[off+bufoff*stream] = uint8(v >> 8)
982-
br[stream].advance(uint8(v))
983-
984-
v2 := single[br[stream2].peekByteFast()>>shift].entry
985995
buf[off+bufoff*stream2] = uint8(v2 >> 8)
986-
br[stream2].advance(uint8(v2))
987996

988-
v = single[br[stream].peekByteFast()>>shift].entry
997+
v = single[uint8(br1.value>>shift)].entry
998+
v2 = single[uint8(br2.value>>shift)].entry
999+
br1.bitsRead += uint8(v)
1000+
br1.value <<= v & 63
1001+
br2.bitsRead += uint8(v2)
1002+
br2.value <<= v2 & 63
9891003
buf[off+bufoff*stream+1] = uint8(v >> 8)
990-
br[stream].advance(uint8(v))
991-
992-
v2 = single[br[stream2].peekByteFast()>>shift].entry
9931004
buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
994-
br[stream2].advance(uint8(v2))
9951005

996-
v = single[br[stream].peekByteFast()>>shift].entry
1006+
v = single[uint8(br1.value>>shift)].entry
1007+
v2 = single[uint8(br2.value>>shift)].entry
1008+
br1.bitsRead += uint8(v)
1009+
br1.value <<= v & 63
1010+
br2.bitsRead += uint8(v2)
1011+
br2.value <<= v2 & 63
9971012
buf[off+bufoff*stream+2] = uint8(v >> 8)
998-
br[stream].advance(uint8(v))
999-
1000-
v2 = single[br[stream2].peekByteFast()>>shift].entry
10011013
buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
1002-
br[stream2].advance(uint8(v2))
1003-
1004-
v = single[br[stream].peekByteFast()>>shift].entry
1005-
buf[off+bufoff*stream+3] = uint8(v >> 8)
1006-
br[stream].advance(uint8(v))
10071014

1008-
v2 = single[br[stream2].peekByteFast()>>shift].entry
1015+
v = single[uint8(br1.value>>shift)].entry
1016+
v2 = single[uint8(br2.value>>shift)].entry
1017+
br1.bitsRead += uint8(v)
1018+
br1.value <<= v & 63
1019+
br2.bitsRead += uint8(v2)
1020+
br2.value <<= v2 & 63
10091021
buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
1010-
br[stream2].advance(uint8(v2))
1022+
buf[off+bufoff*stream+3] = uint8(v >> 8)
10111023
}
10121024

10131025
off += 4
@@ -1073,7 +1085,7 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
10731085
}
10741086

10751087
// Read value and increment offset.
1076-
v := single[br.peekByteFast()>>shift].entry
1088+
v := single[uint8(br.value>>shift)].entry
10771089
nBits := uint8(v)
10781090
br.advance(nBits)
10791091
bitsLeft -= int(nBits)
@@ -1121,7 +1133,7 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
11211133
out := dst
11221134
dstEvery := (dstSize + 3) / 4
11231135

1124-
const shift = 0
1136+
const shift = 56
11251137
const tlSize = 1 << 8
11261138
const tlMask = tlSize - 1
11271139
single := d.dt.single[:tlSize]
@@ -1145,37 +1157,41 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
11451157
br[stream].fillFast()
11461158
br[stream2].fillFast()
11471159

1148-
v := single[br[stream].peekByteFast()>>shift].entry
1160+
v := single[uint8(br[stream].value>>shift)].entry
1161+
v2 := single[uint8(br[stream2].value>>shift)].entry
1162+
br[stream].bitsRead += uint8(v)
1163+
br[stream].value <<= v & 63
1164+
br[stream2].bitsRead += uint8(v2)
1165+
br[stream2].value <<= v2 & 63
11491166
buf[off+bufoff*stream] = uint8(v >> 8)
1150-
br[stream].advance(uint8(v))
1151-
1152-
v2 := single[br[stream2].peekByteFast()>>shift].entry
11531167
buf[off+bufoff*stream2] = uint8(v2 >> 8)
1154-
br[stream2].advance(uint8(v2))
11551168

1156-
v = single[br[stream].peekByteFast()>>shift].entry
1169+
v = single[uint8(br[stream].value>>shift)].entry
1170+
v2 = single[uint8(br[stream2].value>>shift)].entry
1171+
br[stream].bitsRead += uint8(v)
1172+
br[stream].value <<= v & 63
1173+
br[stream2].bitsRead += uint8(v2)
1174+
br[stream2].value <<= v2 & 63
11571175
buf[off+bufoff*stream+1] = uint8(v >> 8)
1158-
br[stream].advance(uint8(v))
1159-
1160-
v2 = single[br[stream2].peekByteFast()>>shift].entry
11611176
buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
1162-
br[stream2].advance(uint8(v2))
11631177

1164-
v = single[br[stream].peekByteFast()>>shift].entry
1178+
v = single[uint8(br[stream].value>>shift)].entry
1179+
v2 = single[uint8(br[stream2].value>>shift)].entry
1180+
br[stream].bitsRead += uint8(v)
1181+
br[stream].value <<= v & 63
1182+
br[stream2].bitsRead += uint8(v2)
1183+
br[stream2].value <<= v2 & 63
11651184
buf[off+bufoff*stream+2] = uint8(v >> 8)
1166-
br[stream].advance(uint8(v))
1167-
1168-
v2 = single[br[stream2].peekByteFast()>>shift].entry
11691185
buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
1170-
br[stream2].advance(uint8(v2))
11711186

1172-
v = single[br[stream].peekByteFast()>>shift].entry
1187+
v = single[uint8(br[stream].value>>shift)].entry
1188+
v2 = single[uint8(br[stream2].value>>shift)].entry
1189+
br[stream].bitsRead += uint8(v)
1190+
br[stream].value <<= v & 63
1191+
br[stream2].bitsRead += uint8(v2)
1192+
br[stream2].value <<= v2 & 63
11731193
buf[off+bufoff*stream+3] = uint8(v >> 8)
1174-
br[stream].advance(uint8(v))
1175-
1176-
v2 = single[br[stream2].peekByteFast()>>shift].entry
11771194
buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
1178-
br[stream2].advance(uint8(v2))
11791195
}
11801196

11811197
{
@@ -1184,37 +1200,41 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
11841200
br[stream].fillFast()
11851201
br[stream2].fillFast()
11861202

1187-
v := single[br[stream].peekByteFast()>>shift].entry
1203+
v := single[uint8(br[stream].value>>shift)].entry
1204+
v2 := single[uint8(br[stream2].value>>shift)].entry
1205+
br[stream].bitsRead += uint8(v)
1206+
br[stream].value <<= v & 63
1207+
br[stream2].bitsRead += uint8(v2)
1208+
br[stream2].value <<= v2 & 63
11881209
buf[off+bufoff*stream] = uint8(v >> 8)
1189-
br[stream].advance(uint8(v))
1190-
1191-
v2 := single[br[stream2].peekByteFast()>>shift].entry
11921210
buf[off+bufoff*stream2] = uint8(v2 >> 8)
1193-
br[stream2].advance(uint8(v2))
11941211

1195-
v = single[br[stream].peekByteFast()>>shift].entry
1212+
v = single[uint8(br[stream].value>>shift)].entry
1213+
v2 = single[uint8(br[stream2].value>>shift)].entry
1214+
br[stream].bitsRead += uint8(v)
1215+
br[stream].value <<= v & 63
1216+
br[stream2].bitsRead += uint8(v2)
1217+
br[stream2].value <<= v2 & 63
11961218
buf[off+bufoff*stream+1] = uint8(v >> 8)
1197-
br[stream].advance(uint8(v))
1198-
1199-
v2 = single[br[stream2].peekByteFast()>>shift].entry
12001219
buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
1201-
br[stream2].advance(uint8(v2))
12021220

1203-
v = single[br[stream].peekByteFast()>>shift].entry
1221+
v = single[uint8(br[stream].value>>shift)].entry
1222+
v2 = single[uint8(br[stream2].value>>shift)].entry
1223+
br[stream].bitsRead += uint8(v)
1224+
br[stream].value <<= v & 63
1225+
br[stream2].bitsRead += uint8(v2)
1226+
br[stream2].value <<= v2 & 63
12041227
buf[off+bufoff*stream+2] = uint8(v >> 8)
1205-
br[stream].advance(uint8(v))
1206-
1207-
v2 = single[br[stream2].peekByteFast()>>shift].entry
12081228
buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
1209-
br[stream2].advance(uint8(v2))
12101229

1211-
v = single[br[stream].peekByteFast()>>shift].entry
1230+
v = single[uint8(br[stream].value>>shift)].entry
1231+
v2 = single[uint8(br[stream2].value>>shift)].entry
1232+
br[stream].bitsRead += uint8(v)
1233+
br[stream].value <<= v & 63
1234+
br[stream2].bitsRead += uint8(v2)
1235+
br[stream2].value <<= v2 & 63
12121236
buf[off+bufoff*stream+3] = uint8(v >> 8)
1213-
br[stream].advance(uint8(v))
1214-
1215-
v2 = single[br[stream2].peekByteFast()>>shift].entry
12161237
buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
1217-
br[stream2].advance(uint8(v2))
12181238
}
12191239

12201240
off += 4
@@ -1280,7 +1300,7 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
12801300
}
12811301

12821302
// Read value and increment offset.
1283-
v := single[br.peekByteFast()>>shift].entry
1303+
v := single[br.peekByteFast()].entry
12841304
nBits := uint8(v)
12851305
br.advance(nBits)
12861306
bitsLeft -= int(nBits)

0 commit comments

Comments
 (0)