Skip to content

Commit 430e96b

Browse files
kerbowa authored and searlmc1 committed
Cherry-pick to mainline for SWDEV-240525
[AMDGPU] Allow spilling FP to memory. If there are no available lanes in a reserved VGPR, no free SGPR, and no unused CSR VGPR when trying to save the FP, it needs to be spilled to memory as a last resort. This can be done in the prolog/epilog if we manually add the spill and manage exec. Differential Revision: https://reviews.llvm.org/D79610 Change-Id: Ifb31cfc27860117c1c5c0cee50cbefd3d6c5eaa4
1 parent 04133d8 commit 430e96b

File tree

2 files changed

+244
-65
lines changed

2 files changed

+244
-65
lines changed

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 135 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -604,6 +604,47 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
604604
llvm_unreachable("Invalid TargetStackID::Value");
605605
}
606606

607+
// Activate all lanes, returns saved exec.
//
// Emits an S_OR_SAVEEXEC instruction (the B32 form when the subtarget is
// wave32, the B64 form otherwise) with an immediate operand of -1, inserted
// before MBBI in MBB. The register passed to BuildMI for that instruction is
// chosen by findScratchNonCalleeSaveRegister() from the wave-mask register
// class, and that same register is returned to the caller.
//
// LiveRegs is an in/out parameter. If it is empty on entry it is initialized
// here: from MBB's live-ins when IsProlog is true, otherwise from MBB's
// live-outs stepped backward over *MBBI. If it is already populated it is
// used as-is. When IsProlog is false, the chosen register is additionally
// removed from LiveRegs before returning.
//
// NOTE(review): the result of findScratchNonCalleeSaveRegister() is used
// unchecked here - confirm how that helper behaves when no register is free.
608+
static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
609+
MachineFunction &MF,
610+
MachineBasicBlock &MBB,
611+
MachineBasicBlock::iterator MBBI,
612+
bool IsProlog) {
613+
Register ScratchExecCopy;
614+
MachineRegisterInfo &MRI = MF.getRegInfo();
615+
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
616+
const SIInstrInfo *TII = ST.getInstrInfo();
617+
const SIRegisterInfo &TRI = TII->getRegisterInfo();
618+
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
619+
// Default-constructed DebugLoc: the emitted instruction carries no location.
DebugLoc DL;
620+
621+
// Only seed the liveness set on first use; an already-populated set is
// reused unchanged so repeated calls share one view of register liveness.
if (LiveRegs.empty()) {
622+
if (IsProlog) {
623+
LiveRegs.init(TRI);
624+
LiveRegs.addLiveIns(MBB);
625+
// NOTE(review): this drops the FP-save SGPR from the live set before the
// scratch search below - confirm the search cannot then hand out that same
// register for the exec copy.
if (FuncInfo->SGPRForFPSaveRestoreCopy)
626+
LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
627+
} else {
628+
// In epilog.
629+
LiveRegs.init(*ST.getRegisterInfo());
630+
LiveRegs.addLiveOuts(MBB);
631+
LiveRegs.stepBackward(*MBBI);
632+
}
633+
}
634+
635+
// Pick the register that will hold the saved exec mask.
ScratchExecCopy = findScratchNonCalleeSaveRegister(
636+
MRI, LiveRegs, *TRI.getWaveMaskRegClass());
637+
638+
// Epilog only: take the chosen register back out of LiveRegs.
if (!IsProlog)
639+
LiveRegs.removeReg(ScratchExecCopy);
640+
641+
// Select the opcode variant that matches the wave size.
const unsigned OrSaveExec =
642+
ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
643+
BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy).addImm(-1);
644+
645+
return ScratchExecCopy;
646+
}
647+
607648
void SIFrameLowering::emitPrologueEntryCFI(MachineBasicBlock &MBB,
608649
MachineBasicBlock::iterator MBBI,
609650
const DebugLoc &DL) const {
@@ -684,11 +725,24 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
684725

685726
emitPrologueEntryCFI(MBB, MBBI, DL);
686727

728+
bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
729+
bool SpillFPToMemory = false;
730+
// A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
731+
// Otherwise we are spilling the FP to memory.
732+
if (HasFPSaveIndex) {
733+
SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
734+
TargetStackID::SGPRSpill;
735+
}
736+
687737
// Emit the copy if we need an FP, and are using a free SGPR to save it.
688738
if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
689739
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy)
690740
.addReg(FramePtrReg)
691741
.setMIFlag(MachineInstr::FrameSetup);
742+
// Make the register live throughout the function.
743+
for (MachineBasicBlock &MBB : MF)
744+
MBB.addLiveIn(FuncInfo->SGPRForFPSaveRestoreCopy);
745+
692746
buildCFI(
693747
MBB, MBBI, DL,
694748
MCCFIInstruction::createRegister(
@@ -701,25 +755,8 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
701755
if (!Reg.FI.hasValue())
702756
continue;
703757

704-
if (ScratchExecCopy == AMDGPU::NoRegister) {
705-
if (LiveRegs.empty()) {
706-
LiveRegs.init(TRI);
707-
LiveRegs.addLiveIns(MBB);
708-
if (FuncInfo->SGPRForFPSaveRestoreCopy)
709-
LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
710-
}
711-
712-
ScratchExecCopy
713-
= findScratchNonCalleeSaveRegister(MRI, LiveRegs,
714-
*TRI.getWaveMaskRegClass());
715-
assert(FuncInfo->SGPRForFPSaveRestoreCopy != ScratchExecCopy);
716-
717-
const unsigned OrSaveExec = ST.isWave32() ?
718-
AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
719-
BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec),
720-
ScratchExecCopy)
721-
.addImm(-1);
722-
}
758+
if (!ScratchExecCopy)
759+
ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
723760

724761
int FI = Reg.FI.getValue();
725762

@@ -733,6 +770,29 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
733770
MFI.getObjectOffset(FI) * ST.getWavefrontSize()));
734771
}
735772

773+
if (HasFPSaveIndex && SpillFPToMemory) {
774+
const int FI = FuncInfo->FramePointerSaveIndex.getValue();
775+
assert(!MFI.isDeadObjectIndex(FI));
776+
777+
if (!ScratchExecCopy)
778+
ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
779+
780+
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
781+
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
782+
783+
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
784+
.addReg(FramePtrReg);
785+
786+
buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
787+
FuncInfo->getScratchRSrcReg(), StackPtrReg,
788+
FuncInfo->FramePointerSaveIndex.getValue());
789+
790+
buildCFI(MBB, MBBI, DL,
791+
MCCFIInstruction::createOffset(
792+
nullptr, MCRI->getDwarfRegNum(FramePtrReg, false),
793+
MFI.getObjectOffset(FI) * ST.getWavefrontSize()));
794+
}
795+
736796
if (ScratchExecCopy) {
737797
// FIXME: Split block and make terminator.
738798
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
@@ -780,12 +840,14 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
780840
buildCFIForSGPRToVGPRSpill(MBB, MBBI, DL, AMDGPU::EXEC, EXECSpill);
781841
}
782842

783-
if (FuncInfo->FramePointerSaveIndex) {
843+
// In this case, spill the FP to a reserved VGPR.
844+
if (HasFPSaveIndex && !SpillFPToMemory) {
784845
const int FI = FuncInfo->FramePointerSaveIndex.getValue();
785-
assert(!MFI.isDeadObjectIndex(FI) &&
786-
MFI.getStackID(FI) == TargetStackID::SGPRSpill);
787-
ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
788-
= FuncInfo->getSGPRToVGPRSpills(FI);
846+
assert(!MFI.isDeadObjectIndex(FI));
847+
848+
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
849+
ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
850+
FuncInfo->getSGPRToVGPRSpills(FI);
789851
assert(Spill.size() == 1);
790852

791853
// Save FP before setting it up.
@@ -880,8 +942,14 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
880942
const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
881943
const Register FramePtrReg = FuncInfo->getFrameOffsetReg();
882944

945+
bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
946+
bool SpillFPToMemory = false;
947+
if (HasFPSaveIndex) {
948+
SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
949+
TargetStackID::SGPRSpill;
950+
}
951+
883952
if (RoundedSize != 0 && hasFP(MF)) {
884-
const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
885953
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
886954
.addReg(StackPtrReg)
887955
.addImm(RoundedSize * ST.getWavefrontSize())
@@ -894,19 +962,31 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
894962
.setMIFlag(MachineInstr::FrameSetup);
895963
}
896964

897-
if (FuncInfo->FramePointerSaveIndex) {
965+
Register ScratchExecCopy;
966+
if (HasFPSaveIndex) {
898967
const int FI = FuncInfo->FramePointerSaveIndex.getValue();
899-
900-
assert(!MF.getFrameInfo().isDeadObjectIndex(FI) &&
901-
MF.getFrameInfo().getStackID(FI) == TargetStackID::SGPRSpill);
902-
903-
ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
904-
= FuncInfo->getSGPRToVGPRSpills(FI);
905-
assert(Spill.size() == 1);
906-
BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
907-
FramePtrReg)
908-
.addReg(Spill[0].VGPR)
909-
.addImm(Spill[0].Lane);
968+
assert(!MFI.isDeadObjectIndex(FI));
969+
if (SpillFPToMemory) {
970+
if (!ScratchExecCopy)
971+
ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
972+
973+
MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
974+
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
975+
buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
976+
FuncInfo->getScratchRSrcReg(), StackPtrReg, FI);
977+
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg)
978+
.addReg(TempVGPR, RegState::Kill);
979+
} else {
980+
// Reload from VGPR spill.
981+
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
982+
ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
983+
FuncInfo->getSGPRToVGPRSpills(FI);
984+
assert(Spill.size() == 1);
985+
BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
986+
FramePtrReg)
987+
.addReg(Spill[0].VGPR)
988+
.addImm(Spill[0].Lane);
989+
}
910990
}
911991

912992
if (hasFP(MF)) {
@@ -915,31 +995,13 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
915995
nullptr, MCRI->getDwarfRegNum(StackPtrReg, false)));
916996
}
917997

918-
unsigned ScratchExecCopy = AMDGPU::NoRegister;
919998
for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
920999
: FuncInfo->getSGPRSpillVGPRs()) {
9211000
if (!Reg.FI.hasValue())
9221001
continue;
9231002

924-
const SIRegisterInfo &TRI = TII->getRegisterInfo();
925-
if (!ScratchExecCopy) {
926-
// See emitPrologue
927-
if (LiveRegs.empty()) {
928-
LiveRegs.init(*ST.getRegisterInfo());
929-
LiveRegs.addLiveOuts(MBB);
930-
LiveRegs.stepBackward(*MBBI);
931-
}
932-
933-
ScratchExecCopy = findScratchNonCalleeSaveRegister(
934-
MRI, LiveRegs, *TRI.getWaveMaskRegClass());
935-
LiveRegs.removeReg(ScratchExecCopy);
936-
937-
const unsigned OrSaveExec =
938-
ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
939-
940-
BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy)
941-
.addImm(-1);
942-
}
1003+
if (!ScratchExecCopy)
1004+
ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
9431005

9441006
buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
9451007
FuncInfo->getScratchRSrcReg(),
@@ -1045,7 +1107,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
10451107
if (MFI->isEntryFunction())
10461108
return;
10471109

1048-
const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1110+
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
10491111
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
10501112
const SIRegisterInfo *TRI = ST.getRegisterInfo();
10511113

@@ -1080,12 +1142,14 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
10801142
if (!HasFP)
10811143
return;
10821144

1145+
// We need to save and restore the current FP.
1146+
1147+
// 1: If there is already a VGPR with free lanes, use it. We
1148+
// may already have to pay the penalty for spilling a CSR VGPR.
10831149
if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
10841150
int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
10851151
TargetStackID::SGPRSpill);
10861152

1087-
// If there is already a VGPR with free lanes, use it. We may already have
1088-
// to pay the penalty for spilling a CSR VGPR.
10891153
if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
10901154
llvm_unreachable("allocate SGPR spill should have worked");
10911155

@@ -1098,16 +1162,22 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
10981162
return;
10991163
}
11001164

1165+
// 2: Next, try to save the FP in an unused SGPR.
11011166
MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo());
11021167

11031168
if (!MFI->SGPRForFPSaveRestoreCopy) {
1104-
// There's no free lane to spill, and no free register to save FP, so we're
1105-
// forced to spill another VGPR to use for the spill.
11061169
int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
11071170
TargetStackID::SGPRSpill);
1108-
if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
1109-
llvm_unreachable("allocate SGPR spill should have worked");
1110-
MFI->FramePointerSaveIndex = NewFI;
1171+
1172+
if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
1173+
// 3: There's no free lane to spill, and no free register to save FP, so
1174+
// we're forced to spill another VGPR to use for the spill.
1175+
MFI->FramePointerSaveIndex = NewFI;
1176+
} else {
1177+
// 4: If all else fails, spill the FP to memory.
1178+
MFI->FramePointerSaveIndex =
1179+
FrameInfo.CreateSpillStackObject(4, Align(4));
1180+
}
11111181

11121182
LLVM_DEBUG(
11131183
auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();

0 commit comments

Comments
 (0)