Skip to content

Commit a448f7e

Browse files
committed
[AMDGPU][SDAG] Tests for target-specific ISD::PTRADD combines
Pre-committing tests to show improvements in a follow-up PR.
1 parent 9e215a2 commit a448f7e

File tree

1 file changed

+176
-0
lines changed

1 file changed

+176
-0
lines changed

llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,3 +142,179 @@ entry:
142142
tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false)
143143
ret void
144144
}
145+
146+
; Test skipping the lower-32-bit addition if it is unnecessary.
147+
; The GEP offset u0x100000000 is 2^32: its low 32 bits are all zero, so only
; the high dword of the 64-bit pointer (v1) actually has to change.
; The LEGACY checks below do exactly that with one 32-bit v_add_u32 on v1; the
; PTRADD checks materialize the constant in s[0:1] and perform a full 64-bit
; v_lshl_add_u64.
; NOTE(review): the RUN lines are outside this view; the two check prefixes
; presumably select ISD::PTRADD-based vs. plain-ADD address lowering - confirm.
define ptr @huge_offset_low_32_unused(ptr %p) {
148+
; GFX942_PTRADD-LABEL: huge_offset_low_32_unused:
149+
; GFX942_PTRADD: ; %bb.0:
150+
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
151+
; GFX942_PTRADD-NEXT: s_mov_b32 s0, 0
152+
; GFX942_PTRADD-NEXT: s_mov_b32 s1, 1
153+
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
154+
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
155+
;
156+
; GFX942_LEGACY-LABEL: huge_offset_low_32_unused:
157+
; GFX942_LEGACY: ; %bb.0:
158+
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
159+
; GFX942_LEGACY-NEXT: v_add_u32_e32 v1, 1, v1
160+
; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
161+
%gep = getelementptr inbounds i8, ptr %p, i64 u0x100000000
162+
ret ptr %gep
163+
}
164+
165+
; Reassociate address computation if it leads to more scalar operations.
166+
; "_r" variant: the kernel-argument offset %soffset is the RIGHT operand of the
; add that forms %offset; the left operand is the zero-extended workitem id.
; Both %p and %soffset arrive in SGPRs via s_load, while the workitem id lives
; in v0. In the LEGACY checks %p + %soffset is computed with the scalar pair
; s_add_u32/s_addc_u32 and only one v_lshl_add_u64 remains; in the PTRADD
; checks both additions are v_lshl_add_u64.
define amdgpu_kernel void @reassoc_scalar_r(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) {
167+
; GFX942_PTRADD-LABEL: reassoc_scalar_r:
168+
; GFX942_PTRADD: ; %bb.0: ; %entry
169+
; GFX942_PTRADD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
170+
; GFX942_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
171+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0
172+
; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
173+
; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
174+
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[6:7]
175+
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3]
176+
; GFX942_PTRADD-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
177+
; GFX942_PTRADD-NEXT: s_endpgm
178+
;
179+
; GFX942_LEGACY-LABEL: reassoc_scalar_r:
180+
; GFX942_LEGACY: ; %bb.0: ; %entry
181+
; GFX942_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
182+
; GFX942_LEGACY-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
183+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 0
184+
; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
185+
; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
186+
; GFX942_LEGACY-NEXT: s_add_u32 s2, s2, s6
187+
; GFX942_LEGACY-NEXT: s_addc_u32 s3, s3, s7
188+
; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
189+
; GFX942_LEGACY-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
190+
; GFX942_LEGACY-NEXT: s_endpgm
191+
entry:
192+
%voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
193+
%voffset = zext i32 %voffset32 to i64
194+
%offset = add nuw nsw i64 %voffset, %soffset
195+
%gep = getelementptr i8, ptr addrspace(1) %p, i64 %offset
196+
store ptr addrspace(1) %gep, ptr addrspace(1) %out
197+
ret void
198+
}
199+
200+
; "_l" variant of the scalar-reassociation test: identical to reassoc_scalar_r
; except that the kernel-argument offset %soffset is the LEFT operand of the
; add that forms %offset (the zero-extended workitem id is on the right).
; The LEGACY checks are the same as for the "_r" variant (scalar
; s_add_u32/s_addc_u32 followed by one v_lshl_add_u64); the PTRADD checks
; differ only in the operand order of the first v_lshl_add_u64.
define amdgpu_kernel void @reassoc_scalar_l(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) {
201+
; GFX942_PTRADD-LABEL: reassoc_scalar_l:
202+
; GFX942_PTRADD: ; %bb.0: ; %entry
203+
; GFX942_PTRADD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
204+
; GFX942_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
205+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0
206+
; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
207+
; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
208+
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[6:7], 0, v[0:1]
209+
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3]
210+
; GFX942_PTRADD-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
211+
; GFX942_PTRADD-NEXT: s_endpgm
212+
;
213+
; GFX942_LEGACY-LABEL: reassoc_scalar_l:
214+
; GFX942_LEGACY: ; %bb.0: ; %entry
215+
; GFX942_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
216+
; GFX942_LEGACY-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
217+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 0
218+
; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
219+
; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
220+
; GFX942_LEGACY-NEXT: s_add_u32 s2, s2, s6
221+
; GFX942_LEGACY-NEXT: s_addc_u32 s3, s3, s7
222+
; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
223+
; GFX942_LEGACY-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
224+
; GFX942_LEGACY-NEXT: s_endpgm
225+
entry:
226+
%voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
227+
%voffset = zext i32 %voffset32 to i64
228+
%offset = add nuw nsw i64 %soffset, %voffset
229+
%gep = getelementptr i8, ptr addrspace(1) %p, i64 %offset
230+
store ptr addrspace(1) %gep, ptr addrspace(1) %out
231+
ret void
232+
}
233+
234+
; Tests the target-specific (ptradd x, shl(0 - y, k)) -> sub(x, shl(y, k)) fold
235+
; The byte offset applied to %p is (0 - %noffset) << %shift.
; In the LEGACY checks the negation is absorbed: %noffset is shifted directly
; and the result is subtracted from the pointer with a v_sub_co/v_subb_co pair.
; In the PTRADD checks the negation is still explicit (v_sub_co/v_subb_co from
; 0), followed by the shift and a separate 64-bit v_lshl_add_u64.
define ptr addrspace(1) @shl_neg_offset(ptr addrspace(1) %p, i64 %noffset, i64 %shift) {
236+
; GFX942_PTRADD-LABEL: shl_neg_offset:
237+
; GFX942_PTRADD: ; %bb.0:
238+
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
239+
; GFX942_PTRADD-NEXT: v_sub_co_u32_e32 v2, vcc, 0, v2
240+
; GFX942_PTRADD-NEXT: s_nop 1
241+
; GFX942_PTRADD-NEXT: v_subb_co_u32_e32 v3, vcc, 0, v3, vcc
242+
; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
243+
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
244+
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
245+
;
246+
; GFX942_LEGACY-LABEL: shl_neg_offset:
247+
; GFX942_LEGACY: ; %bb.0:
248+
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
249+
; GFX942_LEGACY-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
250+
; GFX942_LEGACY-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
251+
; GFX942_LEGACY-NEXT: s_nop 1
252+
; GFX942_LEGACY-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
253+
; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
254+
%offset = sub i64 0, %noffset
255+
%x = shl i64 %offset, %shift
256+
%gep = getelementptr inbounds i8, ptr addrspace(1) %p, i64 %x
257+
ret ptr addrspace(1) %gep
258+
}
259+
260+
; Aggregate used by complextype_global_gep. Member 1 (the [10 x i8] array)
; directly follows the 8-byte i64 member, i.e. it starts at byte offset 8.
%complextype = type { i64, [10 x i8], float }
261+
262+
; Zero-initialized instance in addrspace(1); its address is the constant base
; that complextype_global_gep offsets from.
@v0 = dso_local addrspace(1) global %complextype zeroinitializer
263+
264+
; Check that offsets are folded into global addresses if possible. For example,
265+
; this is relevant when using --amdgpu-lower-module-lds-strategy=table.
266+
; Returns @v0 + 8 (start of member 1 of %complextype) + %offset + 2, i.e. a
; variable offset plus a constant 10 on top of a global address.
; In the LEGACY checks the constant 10 is folded into the relocation addends
; (v0@rel32@lo+14 / v0@rel32@hi+22) and one v_lshl_add_u64 adds %offset.
; In the PTRADD checks the addends stay at +4 / +12 and the constant 10 is
; added by a second v_lshl_add_u64.
define ptr addrspace(1) @complextype_global_gep(i64 %offset) {
267+
; GFX942_PTRADD-LABEL: complextype_global_gep:
268+
; GFX942_PTRADD: ; %bb.0:
269+
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
270+
; GFX942_PTRADD-NEXT: s_getpc_b64 s[0:1]
271+
; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+4
272+
; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+12
273+
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
274+
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 10
275+
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
276+
;
277+
; GFX942_LEGACY-LABEL: complextype_global_gep:
278+
; GFX942_LEGACY: ; %bb.0:
279+
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
280+
; GFX942_LEGACY-NEXT: s_getpc_b64 s[0:1]
281+
; GFX942_LEGACY-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14
282+
; GFX942_LEGACY-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22
283+
; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
284+
; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
285+
%gep0 = getelementptr inbounds %complextype, ptr addrspace(1) @v0, i64 0, i32 1, i64 %offset
286+
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2
287+
ret ptr addrspace(1) %gep1
288+
}
289+
290+
; Packed struct (<{ ... }>): a 4-byte float immediately followed by an 8-byte
; double with no padding, 12 bytes in total. That size is the multiplier 12
; seen in the fold_mad64 check lines.
%S = type <{ float, double }>
291+
292+
; Tests the tryFoldToMad64_32 PTRADD combine.
293+
; Stores 1.0 to field 0 of the %S element of %p indexed by the zero-extended
; workitem id, so the store address is %p + id * 12.
; In the LEGACY checks the multiply and the add are one v_mad_u64_u32.
; In the PTRADD checks the multiply is split into
; v_mul_hi_u32_u24 / v_mul_u32_u24 and the add is a separate v_lshl_add_u64.
define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) {
294+
; GFX942_PTRADD-LABEL: fold_mad64:
295+
; GFX942_PTRADD: ; %bb.0:
296+
; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
297+
; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
298+
; GFX942_PTRADD-NEXT: v_mul_hi_u32_u24_e32 v1, 12, v0
299+
; GFX942_PTRADD-NEXT: v_mul_u32_u24_e32 v0, 12, v0
300+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, 1.0
301+
; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
302+
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
303+
; GFX942_PTRADD-NEXT: global_store_dword v[0:1], v2, off
304+
; GFX942_PTRADD-NEXT: s_endpgm
305+
;
306+
; GFX942_LEGACY-LABEL: fold_mad64:
307+
; GFX942_LEGACY: ; %bb.0:
308+
; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
309+
; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
310+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, 1.0
311+
; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
312+
; GFX942_LEGACY-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
313+
; GFX942_LEGACY-NEXT: global_store_dword v[0:1], v2, off
314+
; GFX942_LEGACY-NEXT: s_endpgm
315+
%voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
316+
%voffset = zext i32 %voffset32 to i64
317+
%p1 = getelementptr inbounds %S, ptr addrspace(1) %p, i64 %voffset, i32 0
318+
store float 1.0, ptr addrspace(1) %p1
319+
ret void
320+
}

0 commit comments

Comments
 (0)