Skip to content

Commit 9ca3693

Browse files
authored
[AMDGPU] Work around s_getpc_b64 zero extending on GFX12 (#78186)
1 parent 4c65787 commit 9ca3693

File tree

9 files changed

+134
-44
lines changed

9 files changed

+134
-44
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1187,6 +1187,10 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
11871187
performs subtraction only if the memory value is greater than or
11881188
equal to the data value.
11891189

1190+
llvm.amdgcn.s.getpc Provides access to the s_getpc_b64 instruction, but with the return value
1191+
sign-extended from the width of the underlying PC hardware register even on
1192+
processors where the s_getpc_b64 instruction returns a zero-extended value.
1193+
11901194
============================================== ==========================================================
11911195

11921196
.. TODO::

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1814,6 +1814,8 @@ def int_amdgcn_s_setreg :
18141814
// not cross a 4 GiB address boundary. Use for any other purpose may not
18151815
// produce the desired results as optimizations may cause code movement,
18161816
// especially as we explicitly use IntrNoMem to allow optimizations.
1817+
// This intrinsic always returns PC sign-extended from 48 bits even if the
1818+
// s_getpc_b64 instruction returns a zero-extended value.
18171819
def int_amdgcn_s_getpc :
18181820
ClangBuiltin<"__builtin_amdgcn_s_getpc">,
18191821
DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable,

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1278,6 +1278,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
12781278
/// values.
12791279
bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
12801280

1281+
/// \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
1282+
/// of sign-extending.
1283+
bool hasGetPCZeroExtension() const { return GFX12Insts; }
1284+
12811285
/// \returns SGPR allocation granularity supported by the subtarget.
12821286
unsigned getSGPRAllocGranule() const {
12831287
return AMDGPU::IsaInfo::getSGPRAllocGranule(this);

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
188188
.addImm(MFI->getGITPtrHigh())
189189
.addReg(TargetReg, RegState::ImplicitDefine);
190190
} else {
191-
const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
191+
const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_pseudo);
192192
BuildMI(MBB, I, DL, GetPC64, TargetReg);
193193
}
194194
Register GitPtrLo = MFI->getGITPtrLoReg(*MF);

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2410,13 +2410,22 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
24102410
// the encoding of $symbol starts 12 bytes after the start of the s_add_u32
24112411
// instruction.
24122412

2413+
int64_t Adjust = 0;
2414+
if (ST.hasGetPCZeroExtension()) {
2415+
// Fix up hardware that does not sign-extend the 48-bit PC value by
2416+
// inserting: s_sext_i32_i16 reghi, reghi
2417+
Bundler.append(
2418+
BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2419+
Adjust += 4;
2420+
}
2421+
24132422
if (OpLo.isGlobal())
2414-
OpLo.setOffset(OpLo.getOffset() + 4);
2423+
OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
24152424
Bundler.append(
24162425
BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
24172426

24182427
if (OpHi.isGlobal())
2419-
OpHi.setOffset(OpHi.getOffset() + 12);
2428+
OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
24202429
Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
24212430
.addReg(RegHi)
24222431
.add(OpHi));
@@ -2480,6 +2489,19 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
24802489
case AMDGPU::S_MUL_I64_I32_PSEUDO:
24812490
MI.setDesc(get(AMDGPU::S_MUL_U64));
24822491
break;
2492+
2493+
case AMDGPU::S_GETPC_B64_pseudo:
2494+
MI.setDesc(get(AMDGPU::S_GETPC_B64));
2495+
if (ST.hasGetPCZeroExtension()) {
2496+
Register Dst = MI.getOperand(0).getReg();
2497+
Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2498+
// Fix up hardware that does not sign-extend the 48-bit PC value by
2499+
// inserting: s_sext_i32_i16 dsthi, dsthi
2500+
BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2501+
DstHi)
2502+
.addReg(DstHi);
2503+
}
2504+
break;
24832505
}
24842506
return true;
24852507
}

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -292,8 +292,11 @@ def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64", [], 1>;
292292
def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32", [], 1>;
293293
def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64", [], 1>;
294294

295+
def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">;
296+
// The pseudo works around a hardware anomaly on some ASICs, where S_GETPC_B64
297+
// zero-extends its 48-bit result instead of sign-extending it.
295298
let isReMaterializable = 1 in
296-
def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64",
299+
def S_GETPC_B64_pseudo : SOP1_64_0 <"s_getpc_b64",
297300
[(set i64:$sdst, (int_amdgcn_s_getpc))]
298301
>;
299302

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -723,8 +723,9 @@ define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32
723723
; GFX12-LABEL: s_buffer_load_index_across_bb:
724724
; GFX12: ; %bb.0: ; %main_body
725725
; GFX12-NEXT: s_getpc_b64 s[4:5]
726-
; GFX12-NEXT: s_add_co_u32 s4, s4, gv@gotpcrel32@lo+4
727-
; GFX12-NEXT: s_add_co_ci_u32 s5, s5, gv@gotpcrel32@hi+12
726+
; GFX12-NEXT: s_sext_i32_i16 s5, s5
727+
; GFX12-NEXT: s_add_co_u32 s4, s4, gv@gotpcrel32@lo+8
728+
; GFX12-NEXT: s_add_co_ci_u32 s5, s5, gv@gotpcrel32@hi+16
728729
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0
729730
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
730731
; GFX12-NEXT: v_mov_b32_e32 v1, 0

llvm/test/CodeGen/AMDGPU/remat-sop.mir

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -581,16 +581,16 @@ body: |
581581
bb.0:
582582
583583
; GCN-LABEL: name: test_remat_s_getpc_b64
584-
; GCN: renamable $sgpr0_sgpr1 = S_GETPC_B64
585-
; GCN-NEXT: renamable $sgpr2_sgpr3 = S_GETPC_B64
584+
; GCN: renamable $sgpr0_sgpr1 = S_GETPC_B64_pseudo
585+
; GCN-NEXT: renamable $sgpr2_sgpr3 = S_GETPC_B64_pseudo
586586
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1
587587
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3
588-
; GCN-NEXT: renamable $sgpr0_sgpr1 = S_GETPC_B64
588+
; GCN-NEXT: renamable $sgpr0_sgpr1 = S_GETPC_B64_pseudo
589589
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1
590590
; GCN-NEXT: S_ENDPGM 0
591-
%0:sgpr_64 = S_GETPC_B64
592-
%1:sgpr_64 = S_GETPC_B64
593-
%2:sgpr_64 = S_GETPC_B64
591+
%0:sgpr_64 = S_GETPC_B64_pseudo
592+
%1:sgpr_64 = S_GETPC_B64_pseudo
593+
%2:sgpr_64 = S_GETPC_B64_pseudo
594594
S_NOP 0, implicit %0
595595
S_NOP 0, implicit %1
596596
S_NOP 0, implicit %2
@@ -604,15 +604,15 @@ body: |
604604
bb.0:
605605
606606
; GCN-LABEL: name: test_remat_s_getpc_b64_2
607-
; GCN: renamable $sgpr0_sgpr1 = S_GETPC_B64
608-
; GCN-NEXT: renamable $sgpr2_sgpr3 = S_GETPC_B64
607+
; GCN: renamable $sgpr0_sgpr1 = S_GETPC_B64_pseudo
608+
; GCN-NEXT: renamable $sgpr2_sgpr3 = S_GETPC_B64_pseudo
609609
; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.3, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.3, addrspace 5)
610610
; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5)
611611
; GCN-NEXT: renamable $sgpr1 = COPY renamable $sgpr2
612612
; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5)
613613
; GCN-NEXT: renamable $sgpr1 = COPY killed renamable $sgpr3
614614
; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.2, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.2, addrspace 5)
615-
; GCN-NEXT: renamable $sgpr0_sgpr1 = S_GETPC_B64
615+
; GCN-NEXT: renamable $sgpr0_sgpr1 = S_GETPC_B64_pseudo
616616
; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.5, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.5, addrspace 5)
617617
; GCN-NEXT: renamable $sgpr0 = COPY killed renamable $sgpr1
618618
; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.4, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.4, addrspace 5)
@@ -635,9 +635,9 @@ body: |
635635
; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.4, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.4, addrspace 5)
636636
; GCN-NEXT: dead renamable $sgpr0 = S_ADDC_U32 killed renamable $sgpr0, killed renamable $sgpr1, implicit-def $scc, implicit $scc
637637
; GCN-NEXT: S_ENDPGM 0
638-
%0:sreg_64 = S_GETPC_B64
639-
%1:sreg_64 = S_GETPC_B64
640-
%2:sreg_64 = S_GETPC_B64
638+
%0:sreg_64 = S_GETPC_B64_pseudo
639+
%1:sreg_64 = S_GETPC_B64_pseudo
640+
%2:sreg_64 = S_GETPC_B64_pseudo
641641
%4:sreg_32 = COPY %0.sub0:sreg_64
642642
%5:sreg_32 = COPY %0.sub1:sreg_64
643643
%6:sreg_32 = COPY %1.sub0:sreg_64

llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll

Lines changed: 80 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,86 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s
3-
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX9
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX11
4+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12
45

56
define void @test_remat_s_getpc_b64() {
6-
; CHECK-LABEL: test_remat_s_getpc_b64:
7-
; CHECK: ; %bb.0: ; %entry
8-
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9-
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
10-
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
11-
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
12-
; CHECK-NEXT: v_writelane_b32 v0, s30, 0
13-
; CHECK-NEXT: s_getpc_b64 s[4:5]
14-
; CHECK-NEXT: v_writelane_b32 v0, s31, 1
15-
; CHECK-NEXT: ;;#ASMSTART
16-
; CHECK-NEXT: ;;#ASMEND
17-
; CHECK-NEXT: ;;#ASMSTART
18-
; CHECK-NEXT: ;;#ASMEND
19-
; CHECK-NEXT: s_getpc_b64 s[4:5]
20-
; CHECK-NEXT: v_mov_b32_e32 v1, s4
21-
; CHECK-NEXT: v_mov_b32_e32 v2, s5
22-
; CHECK-NEXT: global_store_dwordx2 v[1:2], v[1:2], off
23-
; CHECK-NEXT: v_readlane_b32 s31, v0, 1
24-
; CHECK-NEXT: v_readlane_b32 s30, v0, 0
25-
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
26-
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
27-
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
28-
; CHECK-NEXT: s_waitcnt vmcnt(0)
29-
; CHECK-NEXT: s_setpc_b64 s[30:31]
7+
; GFX9-LABEL: test_remat_s_getpc_b64:
8+
; GFX9: ; %bb.0: ; %entry
9+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10+
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
11+
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
12+
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
13+
; GFX9-NEXT: v_writelane_b32 v0, s30, 0
14+
; GFX9-NEXT: s_getpc_b64 s[4:5]
15+
; GFX9-NEXT: v_writelane_b32 v0, s31, 1
16+
; GFX9-NEXT: ;;#ASMSTART
17+
; GFX9-NEXT: ;;#ASMEND
18+
; GFX9-NEXT: ;;#ASMSTART
19+
; GFX9-NEXT: ;;#ASMEND
20+
; GFX9-NEXT: s_getpc_b64 s[4:5]
21+
; GFX9-NEXT: v_mov_b32_e32 v1, s4
22+
; GFX9-NEXT: v_mov_b32_e32 v2, s5
23+
; GFX9-NEXT: global_store_dwordx2 v[1:2], v[1:2], off
24+
; GFX9-NEXT: v_readlane_b32 s31, v0, 1
25+
; GFX9-NEXT: v_readlane_b32 s30, v0, 0
26+
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
27+
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
28+
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
29+
; GFX9-NEXT: s_waitcnt vmcnt(0)
30+
; GFX9-NEXT: s_setpc_b64 s[30:31]
31+
;
32+
; GFX11-LABEL: test_remat_s_getpc_b64:
33+
; GFX11: ; %bb.0: ; %entry
34+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35+
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
36+
; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
37+
; GFX11-NEXT: s_mov_b32 exec_lo, s0
38+
; GFX11-NEXT: v_writelane_b32 v0, s30, 0
39+
; GFX11-NEXT: s_getpc_b64 s[0:1]
40+
; GFX11-NEXT: ;;#ASMSTART
41+
; GFX11-NEXT: ;;#ASMEND
42+
; GFX11-NEXT: v_writelane_b32 v0, s31, 1
43+
; GFX11-NEXT: ;;#ASMSTART
44+
; GFX11-NEXT: ;;#ASMEND
45+
; GFX11-NEXT: s_getpc_b64 s[0:1]
46+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
47+
; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
48+
; GFX11-NEXT: v_readlane_b32 s31, v0, 1
49+
; GFX11-NEXT: v_readlane_b32 s30, v0, 0
50+
; GFX11-NEXT: global_store_b64 v[1:2], v[1:2], off
51+
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
52+
; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
53+
; GFX11-NEXT: s_mov_b32 exec_lo, s0
54+
; GFX11-NEXT: s_waitcnt vmcnt(0)
55+
; GFX11-NEXT: s_setpc_b64 s[30:31]
56+
;
57+
; GFX12-LABEL: test_remat_s_getpc_b64:
58+
; GFX12: ; %bb.0: ; %entry
59+
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
60+
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
61+
; GFX12-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
62+
; GFX12-NEXT: s_mov_b32 exec_lo, s0
63+
; GFX12-NEXT: v_writelane_b32 v0, s30, 0
64+
; GFX12-NEXT: s_getpc_b64 s[0:1]
65+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
66+
; GFX12-NEXT: s_sext_i32_i16 s1, s1
67+
; GFX12-NEXT: ;;#ASMSTART
68+
; GFX12-NEXT: ;;#ASMEND
69+
; GFX12-NEXT: v_writelane_b32 v0, s31, 1
70+
; GFX12-NEXT: ;;#ASMSTART
71+
; GFX12-NEXT: ;;#ASMEND
72+
; GFX12-NEXT: s_getpc_b64 s[0:1]
73+
; GFX12-NEXT: s_sext_i32_i16 s1, s1
74+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
75+
; GFX12-NEXT: v_readlane_b32 s31, v0, 1
76+
; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
77+
; GFX12-NEXT: v_readlane_b32 s30, v0, 0
78+
; GFX12-NEXT: global_store_b64 v[1:2], v[1:2], off
79+
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
80+
; GFX12-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
81+
; GFX12-NEXT: s_mov_b32 exec_lo, s0
82+
; GFX12-NEXT: s_waitcnt vmcnt(0)
83+
; GFX12-NEXT: s_setpc_b64 s[30:31]
3084
entry:
3185
%0 = tail call i64 @llvm.amdgcn.s.getpc()
3286
tail call void asm sideeffect "", "s"(i64 %0)

0 commit comments

Comments
 (0)