Commit ddddf7f
[AArch64][GlobalISel] Split offsets of consecutive stores to aid STP … (llvm#66980)
1 parent 98f6289 commit ddddf7f
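
The combine undoes earlier G_PTR_ADD folding for runs of consecutive stores whose combined offsets no longer fit STP's scaled immediate, rebasing them on a shared pointer so store pairing can succeed. A sketch of the intended shape, adapted from the example in the new combine's comments; register names are illustrative and the now-dead original offset arithmetic is omitted.

Before, with offsets that are out of range for a 128-bit STP:

    %off1:_(s64) = G_CONSTANT i64 4128
    %p1:_(p0) = G_PTR_ADD %base:_, %off1:_(s64)
    G_STORE %val:_(<2 x s64>), %p1:_(p0) :: (store (<2 x s64>), align 1)
    %off2:_(s64) = G_CONSTANT i64 4144
    %p2:_(p0) = G_PTR_ADD %base:_, %off2:_(s64)
    G_STORE %val:_(<2 x s64>), %p2:_(p0) :: (store (<2 x s64>), align 1)
    %off3:_(s64) = G_CONSTANT i64 4160
    %p3:_(p0) = G_PTR_ADD %base:_, %off3:_(s64)
    G_STORE %val:_(<2 x s64>), %p3:_(p0) :: (store (<2 x s64>), align 1)

After, with the first store's G_PTR_ADD reused as the common base and each store addressed at a small offset from it:

    %off1:_(s64) = G_CONSTANT i64 4128
    %newbase:_(p0) = G_PTR_ADD %base:_, %off1:_(s64)
    %c0:_(s64) = G_CONSTANT i64 0
    %q1:_(p0) = G_PTR_ADD %newbase:_, %c0:_(s64)
    G_STORE %val:_(<2 x s64>), %q1:_(p0) :: (store (<2 x s64>), align 1)
    %c16:_(s64) = G_CONSTANT i64 16
    %q2:_(p0) = G_PTR_ADD %newbase:_, %c16:_(s64)
    G_STORE %val:_(<2 x s64>), %q2:_(p0) :: (store (<2 x s64>), align 1)
    %c32:_(s64) = G_CONSTANT i64 32
    %q3:_(p0) = G_PTR_ADD %newbase:_, %c32:_(s64)
    G_STORE %val:_(<2 x s64>), %q3:_(p0) :: (store (<2 x s64>), align 1)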

File tree

3 files changed: +558 -1 lines changed


llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h

Lines changed: 2 additions & 0 deletions
@@ -364,6 +364,8 @@ class MachineIRBuilder {
     State.Observer = &Observer;
   }
 
+  GISelChangeObserver *getObserver() { return State.Observer; }
+
   void stopObservingChanges() { State.Observer = nullptr; }
 
   bool isObservingChanges() const { return State.Observer != nullptr; }
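
The new accessor exposes the builder's attached change observer, so code that mutates an instruction in place (rather than building a replacement) can report the change to listeners such as CSE info. A minimal sketch of that usage pattern, mirroring how the AArch64 combine below rewrites a store's address operand; replaceStorePointer and NewPtrReg are illustrative names, not part of the commit:

    // Hypothetical helper: swap the address operand of a store while keeping
    // any observer attached to the builder informed of the in-place change.
    static void replaceStorePointer(MachineIRBuilder &MIB, MachineInstr &St,
                                    Register NewPtrReg) {
      if (GISelChangeObserver *Observer = MIB.getObserver())
        Observer->changingInstr(St);      // about to modify St
      St.getOperand(1).setReg(NewPtrReg); // operand 1 is the stored address
      if (GISelChangeObserver *Observer = MIB.getObserver())
        Observer->changedInstr(St);       // St has been modified
    }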

llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp

Lines changed: 203 additions & 1 deletion
@@ -20,7 +20,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64TargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
+#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/Combiner.h"
 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
@@ -439,6 +441,22 @@ class AArch64PostLegalizerCombiner : public MachineFunctionPass {
 private:
   bool IsOptNone;
   AArch64PostLegalizerCombinerImplRuleConfig RuleConfig;
+
+
+  struct StoreInfo {
+    GStore *St = nullptr;
+    // The G_PTR_ADD that's used by the store. We keep this to cache the
+    // MachineInstr def.
+    GPtrAdd *Ptr = nullptr;
+    // The signed offset to the Ptr instruction.
+    int64_t Offset = 0;
+    LLT StoredType;
+  };
+  bool tryOptimizeConsecStores(SmallVectorImpl<StoreInfo> &Stores,
+                               CSEMIRBuilder &MIB);
+
+  bool optimizeConsecutiveMemOpAddressing(MachineFunction &MF,
+                                          CSEMIRBuilder &MIB);
 };
 } // end anonymous namespace
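
As a concrete picture of what this struct carries: for a run of three <2 x s64> stores at offsets 4128, 4144 and 4160 from the same base (as in the example in the combine's comment below), the scan in optimizeConsecutiveMemOpAddressing would collect roughly (illustrative values):

    Stores = { {St0, PtrAdd0, 4128, <2 x s64>},
               {St1, PtrAdd1, 4144, <2 x s64>},
               {St2, PtrAdd2, 4160, <2 x s64>} };

before handing the sequence to tryOptimizeConsecStores.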

@@ -492,7 +510,191 @@ bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
                      F.hasMinSize());
   AArch64PostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo,
                                         RuleConfig, ST, MDT, LI);
-  return Impl.combineMachineInstrs();
+  bool Changed = Impl.combineMachineInstrs();
+
+  auto MIB = CSEMIRBuilder(MF);
+  MIB.setCSEInfo(CSEInfo);
+  Changed |= optimizeConsecutiveMemOpAddressing(MF, MIB);
+  return Changed;
+}
+
+bool AArch64PostLegalizerCombiner::tryOptimizeConsecStores(
+    SmallVectorImpl<StoreInfo> &Stores, CSEMIRBuilder &MIB) {
+  if (Stores.size() <= 2)
+    return false;
+
+  // Profitability checks:
+  int64_t BaseOffset = Stores[0].Offset;
+  unsigned NumPairsExpected = Stores.size() / 2;
+  unsigned TotalInstsExpected = NumPairsExpected + (Stores.size() % 2);
+  // Size savings will depend on whether we can fold the offset as an
+  // immediate of an ADD.
+  auto &TLI = *MIB.getMF().getSubtarget().getTargetLowering();
+  if (!TLI.isLegalAddImmediate(BaseOffset))
+    TotalInstsExpected++;
+  int SavingsExpected = Stores.size() - TotalInstsExpected;
+  if (SavingsExpected <= 0)
+    return false;
+
+  auto &MRI = MIB.getMF().getRegInfo();
+
+  // We have a series of consecutive stores. Factor out the common base
+  // pointer and rewrite the offsets.
+  Register NewBase = Stores[0].Ptr->getReg(0);
+  for (auto &SInfo : Stores) {
+    // Compute a new pointer with the new base ptr and adjusted offset.
+    MIB.setInstrAndDebugLoc(*SInfo.St);
+    auto NewOff = MIB.buildConstant(LLT::scalar(64), SInfo.Offset - BaseOffset);
+    auto NewPtr = MIB.buildPtrAdd(MRI.getType(SInfo.St->getPointerReg()),
+                                  NewBase, NewOff);
+    if (MIB.getObserver())
+      MIB.getObserver()->changingInstr(*SInfo.St);
+    SInfo.St->getOperand(1).setReg(NewPtr.getReg(0));
+    if (MIB.getObserver())
+      MIB.getObserver()->changedInstr(*SInfo.St);
+  }
+  LLVM_DEBUG(dbgs() << "Split a series of " << Stores.size()
+                    << " stores into a base pointer and offsets.\n");
+  return true;
+}
+
+static cl::opt<bool>
+    EnableConsecutiveMemOpOpt("aarch64-postlegalizer-consecutive-memops",
+                              cl::init(true), cl::Hidden,
+                              cl::desc("Enable consecutive memop optimization "
+                                       "in AArch64PostLegalizerCombiner"));
+
+bool AArch64PostLegalizerCombiner::optimizeConsecutiveMemOpAddressing(
+    MachineFunction &MF, CSEMIRBuilder &MIB) {
+  // This combine needs to run after all reassociations/folds on pointer
+  // addressing have been done, specifically those that combine two G_PTR_ADDs
+  // with constant offsets into a single G_PTR_ADD with a combined offset.
+  // The goal of this optimization is to undo that combine in the case where
+  // doing so has prevented the formation of pair stores due to illegal
+  // addressing modes of STP. The reason that we do it here is because
+  // it's much easier to undo the transformation of a series of consecutive
+  // mem ops than it is to detect when doing it would be a bad idea looking
+  // at a single G_PTR_ADD in the reassociation/ptradd_immed_chain combine.
+  //
+  // An example:
+  //   G_STORE %11:_(<2 x s64>), %base:_(p0) :: (store (<2 x s64>), align 1)
+  //   %off1:_(s64) = G_CONSTANT i64 4128
+  //   %p1:_(p0) = G_PTR_ADD %base:_, %off1:_(s64)
+  //   G_STORE %11:_(<2 x s64>), %p1:_(p0) :: (store (<2 x s64>), align 1)
+  //   %off2:_(s64) = G_CONSTANT i64 4144
+  //   %p2:_(p0) = G_PTR_ADD %base:_, %off2:_(s64)
+  //   G_STORE %11:_(<2 x s64>), %p2:_(p0) :: (store (<2 x s64>), align 1)
+  //   %off3:_(s64) = G_CONSTANT i64 4160
+  //   %p3:_(p0) = G_PTR_ADD %base:_, %off3:_(s64)
+  //   G_STORE %11:_(<2 x s64>), %p3:_(p0) :: (store (<2 x s64>), align 1)
+  bool Changed = false;
+  auto &MRI = MF.getRegInfo();
+
+  if (!EnableConsecutiveMemOpOpt)
+    return Changed;
+
+  SmallVector<StoreInfo, 8> Stores;
+  // If we see a load, then we keep track of any values defined by it.
+  // In the following example, STP formation will fail anyway because
+  // the latter store is using a load result that appears after the
+  // prior store. In this situation, if we factor out the offset then
+  // we increase code size for no benefit.
+  //   G_STORE %v1:_(s64), %base:_(p0) :: (store (s64))
+  //   %v2:_(s64) = G_LOAD %ldptr:_(p0) :: (load (s64))
+  //   G_STORE %v2:_(s64), %base:_(p0) :: (store (s64))
+  SmallVector<Register> LoadValsSinceLastStore;
+
+  auto storeIsValid = [&](StoreInfo &Last, StoreInfo New) {
+    // Check if this store is consecutive to the last one.
+    if (Last.Ptr->getBaseReg() != New.Ptr->getBaseReg() ||
+        (Last.Offset + static_cast<int64_t>(Last.StoredType.getSizeInBytes()) !=
+         New.Offset) ||
+        Last.StoredType != New.StoredType)
+      return false;
+
+    // Check if this store is using a load result that appears after the
+    // last store. If so, bail out.
+    if (any_of(LoadValsSinceLastStore, [&](Register LoadVal) {
+          return New.St->getValueReg() == LoadVal;
+        }))
+      return false;
+
+    // Check if the current offset would be too large for STP.
+    // If not, then STP formation should be able to handle it, so we don't
+    // need to do anything.
+    int64_t MaxLegalOffset;
+    switch (New.StoredType.getSizeInBits()) {
+    case 32:
+      MaxLegalOffset = 252;
+      break;
+    case 64:
+      MaxLegalOffset = 504;
+      break;
+    case 128:
+      MaxLegalOffset = 1008;
+      break;
+    default:
+      llvm_unreachable("Unexpected stored type size");
+    }
+    if (New.Offset < MaxLegalOffset)
+      return false;
+
+    // If factoring it out still wouldn't help then don't bother.
+    return New.Offset - Stores[0].Offset <= MaxLegalOffset;
+  };
+
+  auto resetState = [&]() {
+    Stores.clear();
+    LoadValsSinceLastStore.clear();
+  };
+
+  for (auto &MBB : MF) {
+    // We're looking inside a single BB at a time since the memset pattern
+    // should only be in a single block.
+    resetState();
+    for (auto &MI : MBB) {
+      if (auto *St = dyn_cast<GStore>(&MI)) {
+        Register PtrBaseReg;
+        APInt Offset;
+        LLT StoredValTy = MRI.getType(St->getValueReg());
+        unsigned ValSize = StoredValTy.getSizeInBits();
+        if (ValSize < 32 || ValSize != St->getMMO().getSizeInBits())
+          continue;
+
+        Register PtrReg = St->getPointerReg();
+        if (mi_match(
+                PtrReg, MRI,
+                m_OneNonDBGUse(m_GPtrAdd(m_Reg(PtrBaseReg), m_ICst(Offset))))) {
+          GPtrAdd *PtrAdd = cast<GPtrAdd>(MRI.getVRegDef(PtrReg));
+          StoreInfo New = {St, PtrAdd, Offset.getSExtValue(), StoredValTy};
+
+          if (Stores.empty()) {
+            Stores.push_back(New);
+            continue;
+          }
+
+          // Check if this store is a valid continuation of the sequence.
+          auto &Last = Stores.back();
+          if (storeIsValid(Last, New)) {
+            Stores.push_back(New);
+            LoadValsSinceLastStore.clear(); // Reset the load value tracking.
+          } else {
+            // The store isn't valid to consider for the prior sequence,
+            // so try to optimize what we have so far and start a new sequence.
+            Changed |= tryOptimizeConsecStores(Stores, MIB);
+            resetState();
+            Stores.push_back(New);
+          }
+        }
+      } else if (auto *Ld = dyn_cast<GLoad>(&MI)) {
+        LoadValsSinceLastStore.push_back(Ld->getDstReg());
+      }
+    }
+    Changed |= tryOptimizeConsecStores(Stores, MIB);
+    resetState();
+  }
+
+  return Changed;
 }
 
 char AArch64PostLegalizerCombiner::ID = 0;
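
A worked pass through the new profitability check, with illustrative numbers rather than ones taken from the commit's tests: for six consecutive <2 x s64> stores starting at offset 4128, NumPairsExpected = 6 / 2 = 3 and TotalInstsExpected starts at 3; 4128 is not encodeable as an AArch64 ADD immediate (12 bits, optionally shifted left by 12), so TotalInstsExpected becomes 4 and SavingsExpected = 6 - 4 = 2 > 0, which allows the rewrite. The MaxLegalOffset limits of 252, 504 and 1008 in storeIsValid correspond to the largest positive scaled 7-bit STP immediate, i.e. 63 times the access size in bytes, for 32-, 64- and 128-bit stores respectively.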
