Skip to content

Commit 64f22e4

Browse files
agnivade authored and nigeltao committed
image/jpeg: reduce bound checks from idct and fdct
Before -
$gotip build -gcflags="-d=ssa/check_bce/debug=1" fdct.go idct.go
./fdct.go:89:10: Found IsInBounds
./fdct.go:90:10: Found IsInBounds
./fdct.go:91:10: Found IsInBounds
./fdct.go:92:10: Found IsInBounds
./fdct.go:93:10: Found IsInBounds
./fdct.go:94:10: Found IsInBounds
./fdct.go:95:10: Found IsInBounds
./fdct.go:96:10: Found IsInBounds
./idct.go:77:9: Found IsInBounds
./idct.go:77:27: Found IsInBounds
./idct.go:77:45: Found IsInBounds
./idct.go:78:7: Found IsInBounds
./idct.go:78:25: Found IsInBounds
./idct.go:78:43: Found IsInBounds
./idct.go:78:61: Found IsInBounds
./idct.go:79:13: Found IsInBounds
./idct.go:92:13: Found IsInBounds
./idct.go:93:12: Found IsInBounds
./idct.go:94:12: Found IsInBounds
./idct.go:95:12: Found IsInBounds
./idct.go:97:12: Found IsInBounds
./idct.go:98:12: Found IsInBounds
./idct.go:99:12: Found IsInBounds

After -
$gotip build -gcflags="-d=ssa/check_bce/debug=1" fdct.go idct.go
./fdct.go:90:9: Found IsSliceInBounds
./idct.go:76:11: Found IsSliceInBounds
./idct.go:145:11: Found IsSliceInBounds

name                 old time/op    new time/op    delta
FDCT-4               1.85µs ± 2%    1.74µs ± 1%    -5.95%  (p=0.000 n=10+10)
IDCT-4               1.94µs ± 2%    1.89µs ± 1%    -2.67%  (p=0.000 n=10+9)
DecodeBaseline-4     1.45ms ± 2%    1.46ms ± 1%    ~       (p=0.156 n=9+10)
DecodeProgressive-4  2.21ms ± 1%    2.21ms ± 1%    ~       (p=0.796 n=10+10)
EncodeRGBA-4         24.9ms ± 1%    25.0ms ± 1%    ~       (p=0.075 n=10+10)
EncodeYCbCr-4        26.1ms ± 1%    26.2ms ± 1%    ~       (p=0.573 n=8+10)

name                 old speed      new speed      delta
DecodeBaseline-4     42.5MB/s ± 2%  42.4MB/s ± 1%  ~       (p=0.162 n=9+10)
DecodeProgressive-4  27.9MB/s ± 1%  27.9MB/s ± 1%  ~       (p=0.796 n=10+10)
EncodeRGBA-4         49.4MB/s ± 1%  49.1MB/s ± 1%  ~       (p=0.066 n=10+10)
EncodeYCbCr-4        35.3MB/s ± 1%  35.2MB/s ± 1%  ~       (p=0.586 n=8+10)

name                 old alloc/op   new alloc/op   delta
DecodeBaseline-4     63.0kB ± 0%    63.0kB ± 0%    ~       (all equal)
DecodeProgressive-4  260kB ± 0%     260kB ± 0%     ~       (all equal)
EncodeRGBA-4         4.40kB ± 0%    4.40kB ± 0%    ~       (all equal)
EncodeYCbCr-4        4.40kB ± 0%    4.40kB ± 0%    ~       (all equal)

name                 old allocs/op  new allocs/op  delta
DecodeBaseline-4     5.00 ± 0%      5.00 ± 0%      ~       (all equal)
DecodeProgressive-4  13.0 ± 0%      13.0 ± 0%      ~       (all equal)
EncodeRGBA-4         4.00 ± 0%      4.00 ± 0%      ~       (all equal)
EncodeYCbCr-4        4.00 ± 0%      4.00 ± 0%      ~       (all equal)

Updates golang#24499

Change-Id: I6828d077b851817503a7c1a08235763f81bdadf9
Reviewed-on: https://go-review.googlesource.com/c/go/+/167417
Run-TryBot: Agniva De Sarker <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
Reviewed-by: Nigel Tao <[email protected]>
1 parent 2da9659 commit 64f22e4

File tree

2 files changed: +63 -59 lines changed

src/image/jpeg/fdct.go

+18-16
Original file line number | Diff line number | Diff line change
@@ -86,14 +86,16 @@ const (
8686
func fdct(b *block) {
8787
// Pass 1: process rows.
8888
for y := 0; y < 8; y++ {
89-
x0 := b[y*8+0]
90-
x1 := b[y*8+1]
91-
x2 := b[y*8+2]
92-
x3 := b[y*8+3]
93-
x4 := b[y*8+4]
94-
x5 := b[y*8+5]
95-
x6 := b[y*8+6]
96-
x7 := b[y*8+7]
89+
y8 := y * 8
90+
s := b[y8 : y8+8 : y8+8] // Small cap improves performance, see https://golang.org/issue/27857
91+
x0 := s[0]
92+
x1 := s[1]
93+
x2 := s[2]
94+
x3 := s[3]
95+
x4 := s[4]
96+
x5 := s[5]
97+
x6 := s[6]
98+
x7 := s[7]
9799

98100
tmp0 := x0 + x7
99101
tmp1 := x1 + x6
@@ -110,12 +112,12 @@ func fdct(b *block) {
110112
tmp2 = x2 - x5
111113
tmp3 = x3 - x4
112114

113-
b[y*8+0] = (tmp10 + tmp11 - 8*centerJSample) << pass1Bits
114-
b[y*8+4] = (tmp10 - tmp11) << pass1Bits
115+
s[0] = (tmp10 + tmp11 - 8*centerJSample) << pass1Bits
116+
s[4] = (tmp10 - tmp11) << pass1Bits
115117
z1 := (tmp12 + tmp13) * fix_0_541196100
116118
z1 += 1 << (constBits - pass1Bits - 1)
117-
b[y*8+2] = (z1 + tmp12*fix_0_765366865) >> (constBits - pass1Bits)
118-
b[y*8+6] = (z1 - tmp13*fix_1_847759065) >> (constBits - pass1Bits)
119+
s[2] = (z1 + tmp12*fix_0_765366865) >> (constBits - pass1Bits)
120+
s[6] = (z1 - tmp13*fix_1_847759065) >> (constBits - pass1Bits)
119121

120122
tmp10 = tmp0 + tmp3
121123
tmp11 = tmp1 + tmp2
@@ -134,10 +136,10 @@ func fdct(b *block) {
134136

135137
tmp12 += z1
136138
tmp13 += z1
137-
b[y*8+1] = (tmp0 + tmp10 + tmp12) >> (constBits - pass1Bits)
138-
b[y*8+3] = (tmp1 + tmp11 + tmp13) >> (constBits - pass1Bits)
139-
b[y*8+5] = (tmp2 + tmp11 + tmp12) >> (constBits - pass1Bits)
140-
b[y*8+7] = (tmp3 + tmp10 + tmp13) >> (constBits - pass1Bits)
139+
s[1] = (tmp0 + tmp10 + tmp12) >> (constBits - pass1Bits)
140+
s[3] = (tmp1 + tmp11 + tmp13) >> (constBits - pass1Bits)
141+
s[5] = (tmp2 + tmp11 + tmp12) >> (constBits - pass1Bits)
142+
s[7] = (tmp3 + tmp10 + tmp13) >> (constBits - pass1Bits)
141143
}
142144
// Pass 2: process columns.
143145
// We remove pass1Bits scaling, but leave results scaled up by an overall factor of 8.

src/image/jpeg/idct.go

+45-43
Original file line number | Diff line number | Diff line change
@@ -73,30 +73,31 @@ func idct(src *block) {
7373
// Horizontal 1-D IDCT.
7474
for y := 0; y < 8; y++ {
7575
y8 := y * 8
76+
s := src[y8 : y8+8 : y8+8] // Small cap improves performance, see https://golang.org/issue/27857
7677
// If all the AC components are zero, then the IDCT is trivial.
77-
if src[y8+1] == 0 && src[y8+2] == 0 && src[y8+3] == 0 &&
78-
src[y8+4] == 0 && src[y8+5] == 0 && src[y8+6] == 0 && src[y8+7] == 0 {
79-
dc := src[y8+0] << 3
80-
src[y8+0] = dc
81-
src[y8+1] = dc
82-
src[y8+2] = dc
83-
src[y8+3] = dc
84-
src[y8+4] = dc
85-
src[y8+5] = dc
86-
src[y8+6] = dc
87-
src[y8+7] = dc
78+
if s[1] == 0 && s[2] == 0 && s[3] == 0 &&
79+
s[4] == 0 && s[5] == 0 && s[6] == 0 && s[7] == 0 {
80+
dc := s[0] << 3
81+
s[0] = dc
82+
s[1] = dc
83+
s[2] = dc
84+
s[3] = dc
85+
s[4] = dc
86+
s[5] = dc
87+
s[6] = dc
88+
s[7] = dc
8889
continue
8990
}
9091

9192
// Prescale.
92-
x0 := (src[y8+0] << 11) + 128
93-
x1 := src[y8+4] << 11
94-
x2 := src[y8+6]
95-
x3 := src[y8+2]
96-
x4 := src[y8+1]
97-
x5 := src[y8+7]
98-
x6 := src[y8+5]
99-
x7 := src[y8+3]
93+
x0 := (s[0] << 11) + 128
94+
x1 := s[4] << 11
95+
x2 := s[6]
96+
x3 := s[2]
97+
x4 := s[1]
98+
x5 := s[7]
99+
x6 := s[5]
100+
x7 := s[3]
100101

101102
// Stage 1.
102103
x8 := w7 * (x4 + x5)
@@ -126,31 +127,32 @@ func idct(src *block) {
126127
x4 = (r2*(x4-x5) + 128) >> 8
127128

128129
// Stage 4.
129-
src[y8+0] = (x7 + x1) >> 8
130-
src[y8+1] = (x3 + x2) >> 8
131-
src[y8+2] = (x0 + x4) >> 8
132-
src[y8+3] = (x8 + x6) >> 8
133-
src[y8+4] = (x8 - x6) >> 8
134-
src[y8+5] = (x0 - x4) >> 8
135-
src[y8+6] = (x3 - x2) >> 8
136-
src[y8+7] = (x7 - x1) >> 8
130+
s[0] = (x7 + x1) >> 8
131+
s[1] = (x3 + x2) >> 8
132+
s[2] = (x0 + x4) >> 8
133+
s[3] = (x8 + x6) >> 8
134+
s[4] = (x8 - x6) >> 8
135+
s[5] = (x0 - x4) >> 8
136+
s[6] = (x3 - x2) >> 8
137+
s[7] = (x7 - x1) >> 8
137138
}
138139

139140
// Vertical 1-D IDCT.
140141
for x := 0; x < 8; x++ {
141142
// Similar to the horizontal 1-D IDCT case, if all the AC components are zero, then the IDCT is trivial.
142143
// However, after performing the horizontal 1-D IDCT, there are typically non-zero AC components, so
143144
// we do not bother to check for the all-zero case.
145+
s := src[x : x+57 : x+57] // Small cap improves performance, see https://golang.org/issue/27857
144146

145147
// Prescale.
146-
y0 := (src[8*0+x] << 8) + 8192
147-
y1 := src[8*4+x] << 8
148-
y2 := src[8*6+x]
149-
y3 := src[8*2+x]
150-
y4 := src[8*1+x]
151-
y5 := src[8*7+x]
152-
y6 := src[8*5+x]
153-
y7 := src[8*3+x]
148+
y0 := (s[8*0] << 8) + 8192
149+
y1 := s[8*4] << 8
150+
y2 := s[8*6]
151+
y3 := s[8*2]
152+
y4 := s[8*1]
153+
y5 := s[8*7]
154+
y6 := s[8*5]
155+
y7 := s[8*3]
154156

155157
// Stage 1.
156158
y8 := w7*(y4+y5) + 4
@@ -180,13 +182,13 @@ func idct(src *block) {
180182
y4 = (r2*(y4-y5) + 128) >> 8
181183

182184
// Stage 4.
183-
src[8*0+x] = (y7 + y1) >> 14
184-
src[8*1+x] = (y3 + y2) >> 14
185-
src[8*2+x] = (y0 + y4) >> 14
186-
src[8*3+x] = (y8 + y6) >> 14
187-
src[8*4+x] = (y8 - y6) >> 14
188-
src[8*5+x] = (y0 - y4) >> 14
189-
src[8*6+x] = (y3 - y2) >> 14
190-
src[8*7+x] = (y7 - y1) >> 14
185+
s[8*0] = (y7 + y1) >> 14
186+
s[8*1] = (y3 + y2) >> 14
187+
s[8*2] = (y0 + y4) >> 14
188+
s[8*3] = (y8 + y6) >> 14
189+
s[8*4] = (y8 - y6) >> 14
190+
s[8*5] = (y0 - y4) >> 14
191+
s[8*6] = (y3 - y2) >> 14
192+
s[8*7] = (y7 - y1) >> 14
191193
}
192194
}

0 commit comments

Comments
 (0)