@@ -142,3 +142,179 @@ entry:
142
142
tail call void @llvm.memcpy.p1.p4.i64 (ptr addrspace (1 ) noundef nonnull align 1 %dst , ptr addrspace (4 ) noundef nonnull align 1 %src , i64 16 , i1 false )
143
143
ret void
144
144
}
145
+
146
+ ; Test skipping the lower-32-bit addition if it is unnecessary: the constant offset used below is 1 << 32, so its low 32 bits are all zero.
147
+ define ptr @huge_offset_low_32_unused (ptr %p ) {
148
+ ; GFX942_PTRADD-LABEL: huge_offset_low_32_unused:
149
+ ; GFX942_PTRADD: ; %bb.0:
150
+ ; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
151
+ ; GFX942_PTRADD-NEXT: s_mov_b32 s0, 0
152
+ ; GFX942_PTRADD-NEXT: s_mov_b32 s1, 1
153
+ ; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
154
+ ; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
155
+ ;
156
+ ; GFX942_LEGACY-LABEL: huge_offset_low_32_unused:
157
+ ; GFX942_LEGACY: ; %bb.0:
158
+ ; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
159
+ ; GFX942_LEGACY-NEXT: v_add_u32_e32 v1, 1, v1
160
+ ; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
161
+ %gep = getelementptr inbounds i8 , ptr %p , i64 u0x100000000 ; offset == 1 << 32; the legacy checks add 1 to the high dword only, the ptradd checks still do a full 64-bit add
162
+ ret ptr %gep
163
+ }
164
+
165
+ ; Reassociate address computation if it leads to more scalar operations. In the checks below, %p and %soffset are loaded into SGPRs while %voffset is derived from v0.
166
+ define amdgpu_kernel void @reassoc_scalar_r (ptr addrspace (1 ) %out , ptr addrspace (1 ) %p , i64 %soffset ) {
167
+ ; GFX942_PTRADD-LABEL: reassoc_scalar_r:
168
+ ; GFX942_PTRADD: ; %bb.0: ; %entry
169
+ ; GFX942_PTRADD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
170
+ ; GFX942_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
171
+ ; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0
172
+ ; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
173
+ ; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
174
+ ; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[6:7]
175
+ ; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3]
176
+ ; GFX942_PTRADD-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
177
+ ; GFX942_PTRADD-NEXT: s_endpgm
178
+ ;
179
+ ; GFX942_LEGACY-LABEL: reassoc_scalar_r:
180
+ ; GFX942_LEGACY: ; %bb.0: ; %entry
181
+ ; GFX942_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
182
+ ; GFX942_LEGACY-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
183
+ ; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 0
184
+ ; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
185
+ ; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
186
+ ; GFX942_LEGACY-NEXT: s_add_u32 s2, s2, s6
187
+ ; GFX942_LEGACY-NEXT: s_addc_u32 s3, s3, s7
188
+ ; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
189
+ ; GFX942_LEGACY-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
190
+ ; GFX942_LEGACY-NEXT: s_endpgm
191
+ entry:
192
+ %voffset32 = call i32 @llvm.amdgcn.workitem.id.x () ; per-workitem value, so everything derived from it lives in VGPRs in the checks
193
+ %voffset = zext i32 %voffset32 to i64
194
+ %offset = add nuw nsw i64 %voffset , %soffset ; vector operand on the left, scalar operand on the right (the "_r" variant)
195
+ %gep = getelementptr i8 , ptr addrspace (1 ) %p , i64 %offset ; the legacy checks first combine %p and %soffset with scalar adds, leaving a single vector add
196
+ store ptr addrspace (1 ) %gep , ptr addrspace (1 ) %out ; store the computed pointer so the address arithmetic stays live
197
+ ret void
198
+ }
199
+
200
+ define amdgpu_kernel void @reassoc_scalar_l (ptr addrspace (1 ) %out , ptr addrspace (1 ) %p , i64 %soffset ) {
201
+ ; GFX942_PTRADD-LABEL: reassoc_scalar_l:
202
+ ; GFX942_PTRADD: ; %bb.0: ; %entry
203
+ ; GFX942_PTRADD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
204
+ ; GFX942_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
205
+ ; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0
206
+ ; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
207
+ ; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
208
+ ; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[6:7], 0, v[0:1]
209
+ ; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3]
210
+ ; GFX942_PTRADD-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
211
+ ; GFX942_PTRADD-NEXT: s_endpgm
212
+ ;
213
+ ; GFX942_LEGACY-LABEL: reassoc_scalar_l:
214
+ ; GFX942_LEGACY: ; %bb.0: ; %entry
215
+ ; GFX942_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
216
+ ; GFX942_LEGACY-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
217
+ ; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 0
218
+ ; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
219
+ ; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
220
+ ; GFX942_LEGACY-NEXT: s_add_u32 s2, s2, s6
221
+ ; GFX942_LEGACY-NEXT: s_addc_u32 s3, s3, s7
222
+ ; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
223
+ ; GFX942_LEGACY-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
224
+ ; GFX942_LEGACY-NEXT: s_endpgm
225
+ entry:
226
+ %voffset32 = call i32 @llvm.amdgcn.workitem.id.x () ; per-workitem value, so everything derived from it lives in VGPRs in the checks
227
+ %voffset = zext i32 %voffset32 to i64
228
+ %offset = add nuw nsw i64 %soffset , %voffset ; same computation as reassoc_scalar_r with the add operands swapped: scalar operand on the left (the "_l" variant)
229
+ %gep = getelementptr i8 , ptr addrspace (1 ) %p , i64 %offset ; the legacy checks first combine %p and %soffset with scalar adds, leaving a single vector add
230
+ store ptr addrspace (1 ) %gep , ptr addrspace (1 ) %out ; store the computed pointer so the address arithmetic stays live
231
+ ret void
232
+ }
233
+
234
+ ; Tests the target-specific (ptradd x, shl(0 - y, k)) -> sub(x, shl(y, k)) fold, with x = %p, y = %noffset and k = %shift.
235
+ define ptr addrspace (1 ) @shl_neg_offset (ptr addrspace (1 ) %p , i64 %noffset , i64 %shift ) {
236
+ ; GFX942_PTRADD-LABEL: shl_neg_offset:
237
+ ; GFX942_PTRADD: ; %bb.0:
238
+ ; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
239
+ ; GFX942_PTRADD-NEXT: v_sub_co_u32_e32 v2, vcc, 0, v2
240
+ ; GFX942_PTRADD-NEXT: s_nop 1
241
+ ; GFX942_PTRADD-NEXT: v_subb_co_u32_e32 v3, vcc, 0, v3, vcc
242
+ ; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
243
+ ; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
244
+ ; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
245
+ ;
246
+ ; GFX942_LEGACY-LABEL: shl_neg_offset:
247
+ ; GFX942_LEGACY: ; %bb.0:
248
+ ; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
249
+ ; GFX942_LEGACY-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
250
+ ; GFX942_LEGACY-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
251
+ ; GFX942_LEGACY-NEXT: s_nop 1
252
+ ; GFX942_LEGACY-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
253
+ ; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
254
+ %offset = sub i64 0 , %noffset ; the (0 - y) part of the pattern
255
+ %x = shl i64 %offset , %shift ; shl(0 - y, k) with a variable shift amount
256
+ %gep = getelementptr inbounds i8 , ptr addrspace (1 ) %p , i64 %x ; the legacy checks shift then subtract from %p; the ptradd checks still negate, shift and add
257
+ ret ptr addrspace (1 ) %gep
258
+ }
259
+
260
+ %complextype = type { i64 , [10 x i8 ], float } ; field 1 (the [10 x i8] array) starts right after the 8-byte i64
261
+
262
+ @v0 = dso_local addrspace (1 ) global %complextype zeroinitializer ; global whose address is the GEP base in complextype_global_gep
263
+
264
+ ; Check that offsets are folded into global addresses if possible. For example,
265
+ ; this is relevant when using --amdgpu-lower-module-lds-strategy=table.
266
+ define ptr addrspace (1 ) @complextype_global_gep (i64 %offset ) {
267
+ ; GFX942_PTRADD-LABEL: complextype_global_gep:
268
+ ; GFX942_PTRADD: ; %bb.0:
269
+ ; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
270
+ ; GFX942_PTRADD-NEXT: s_getpc_b64 s[0:1]
271
+ ; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+4
272
+ ; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+12
273
+ ; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
274
+ ; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 10
275
+ ; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
276
+ ;
277
+ ; GFX942_LEGACY-LABEL: complextype_global_gep:
278
+ ; GFX942_LEGACY: ; %bb.0:
279
+ ; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
280
+ ; GFX942_LEGACY-NEXT: s_getpc_b64 s[0:1]
281
+ ; GFX942_LEGACY-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14
282
+ ; GFX942_LEGACY-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22
283
+ ; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
284
+ ; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
285
+ %gep0 = getelementptr inbounds %complextype , ptr addrspace (1 ) @v0 , i64 0 , i32 1 , i64 %offset ; @v0 + 8 (field 1) + %offset
286
+ %gep1 = getelementptr inbounds i8 , ptr addrspace (1 ) %gep0 , i64 2 ; constant part totals 8 + 2 = 10; the legacy checks fold it into the relocation addend (+14 vs +4), the ptradd checks add 10 separately
287
+ ret ptr addrspace (1 ) %gep1
288
+ }
289
+
290
+ %S = type <{ float , double }> ; packed struct: 4 + 8 = 12 bytes, which is the multiplier 12 seen in the checks
291
+
292
+ ; Tests the tryFoldToMad64_32 PTRADD combine: indexing an array of %S by a zero-extended 32-bit value is a 32-bit multiply by 12 added to a 64-bit base.
293
+ define amdgpu_kernel void @fold_mad64 (ptr addrspace (1 ) %p ) {
294
+ ; GFX942_PTRADD-LABEL: fold_mad64:
295
+ ; GFX942_PTRADD: ; %bb.0:
296
+ ; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
297
+ ; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
298
+ ; GFX942_PTRADD-NEXT: v_mul_hi_u32_u24_e32 v1, 12, v0
299
+ ; GFX942_PTRADD-NEXT: v_mul_u32_u24_e32 v0, 12, v0
300
+ ; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, 1.0
301
+ ; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
302
+ ; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
303
+ ; GFX942_PTRADD-NEXT: global_store_dword v[0:1], v2, off
304
+ ; GFX942_PTRADD-NEXT: s_endpgm
305
+ ;
306
+ ; GFX942_LEGACY-LABEL: fold_mad64:
307
+ ; GFX942_LEGACY: ; %bb.0:
308
+ ; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
309
+ ; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
310
+ ; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, 1.0
311
+ ; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
312
+ ; GFX942_LEGACY-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
313
+ ; GFX942_LEGACY-NEXT: global_store_dword v[0:1], v2, off
314
+ ; GFX942_LEGACY-NEXT: s_endpgm
315
+ %voffset32 = call i32 @llvm.amdgcn.workitem.id.x () ; 32-bit index source for the multiply
316
+ %voffset = zext i32 %voffset32 to i64
317
+ %p1 = getelementptr inbounds %S , ptr addrspace (1 ) %p , i64 %voffset , i32 0 ; %p + %voffset * 12; the legacy checks use one v_mad_u64_u32, the ptradd checks use two multiplies plus a 64-bit add
318
+ store float 1 .0 , ptr addrspace (1 ) %p1 ; store through the computed address so the arithmetic stays live
319
+ ret void
320
+ }
0 commit comments