
Commit 7ec97b4

SC llvm team authored and committed

Merged main:985362e2f38f into amd-gfx:9003dd05f959

Local branch amd-gfx 9003dd0: Merged main:9b6b2a0cec53 into amd-gfx:80557850aa2c
Remote branch main 985362e: [AArch64][GlobalISel] Avoid running the shl(zext(a), C) -> zext(shl(a, C)) combine. (llvm#67045)

2 parents 9003dd0 + 985362e commit 7ec97b4

File tree

34 files changed: +1367 -440 lines

compiler-rt/lib/hwasan/hwasan_report.cpp

Lines changed: 28 additions & 10 deletions
@@ -323,6 +323,21 @@ static uptr GetGlobalSizeFromDescriptor(uptr ptr) {
 
 void ReportStats() {}
 
+constexpr uptr kDumpWidth = 16;
+constexpr uptr kShadowLines = 17;
+constexpr uptr kShadowDumpSize = kShadowLines * kDumpWidth;
+
+constexpr uptr kShortLines = 3;
+constexpr uptr kShortDumpSize = kShortLines * kDumpWidth;
+constexpr uptr kShortDumpOffset = (kShadowLines - kShortLines) / 2 * kDumpWidth;
+
+static uptr GetPrintTagStart(uptr addr) {
+  addr = MemToShadow(addr);
+  addr = RoundDownTo(addr, kDumpWidth);
+  addr -= kDumpWidth * (kShadowLines / 2);
+  return addr;
+}
+
 template <typename PrintTag>
 static void PrintTagInfoAroundAddr(uptr addr, uptr num_rows,
                                    InternalScopedString &s,
@@ -352,7 +367,7 @@ static void PrintTagsAroundAddr(uptr addr, GetTag get_tag,
       "Memory tags around the buggy address (one tag corresponds to %zd "
       "bytes):\n",
       kShadowAlignment);
-  PrintTagInfoAroundAddr(addr, 17, s,
+  PrintTagInfoAroundAddr(addr, kShadowLines, s,
                          [&](InternalScopedString &s, uptr tag_addr) {
                            tag_t tag = get_tag(tag_addr);
                            s.AppendF("%02x", tag);
@@ -362,7 +377,7 @@ static void PrintTagsAroundAddr(uptr addr, GetTag get_tag,
       "Tags for short granules around the buggy address (one tag corresponds "
       "to %zd bytes):\n",
       kShadowAlignment);
-  PrintTagInfoAroundAddr(addr, 3, s,
+  PrintTagInfoAroundAddr(addr, kShortLines, s,
                          [&](InternalScopedString &s, uptr tag_addr) {
                            tag_t tag = get_tag(tag_addr);
                            if (tag >= 1 && tag <= kShadowAlignment) {
@@ -439,8 +454,8 @@ class BaseReport {
 
   struct Shadow {
     uptr addr = 0;
-    tag_t tags[512] = {};
-    tag_t short_tags[ARRAY_SIZE(tags)] = {};
+    tag_t tags[kShadowDumpSize] = {};
+    tag_t short_tags[kShortDumpSize] = {};
   };
 
   sptr FindMismatchOffset() const;
@@ -508,16 +523,19 @@ BaseReport::Shadow BaseReport::CopyShadow() const {
   if (!MemIsApp(untagged_addr))
     return result;
 
-  result.addr = MemToShadow(untagged_addr) - ARRAY_SIZE(result.tags) / 2;
-  for (uptr i = 0; i < ARRAY_SIZE(result.tags); ++i) {
-    uptr tag_addr = result.addr + i;
+  result.addr = GetPrintTagStart(untagged_addr + mismatch_offset);
+  uptr tag_addr = result.addr;
+  uptr short_end = kShortDumpOffset + ARRAY_SIZE(shadow.short_tags);
+  for (uptr i = 0; i < ARRAY_SIZE(result.tags); ++i, ++tag_addr) {
     if (!MemIsShadow(tag_addr))
       continue;
     result.tags[i] = *reinterpret_cast<tag_t *>(tag_addr);
+    if (i < kShortDumpOffset || i >= short_end)
+      continue;
     uptr granule_addr = ShadowToMem(tag_addr);
     if (1 <= result.tags[i] && result.tags[i] <= kShadowAlignment &&
         IsAccessibleMemoryRange(granule_addr, kShadowAlignment)) {
-      result.short_tags[i] =
+      result.short_tags[i - kShortDumpOffset] =
           *reinterpret_cast<tag_t *>(granule_addr + kShadowAlignment - 1);
     }
   }
@@ -532,8 +550,8 @@ tag_t BaseReport::GetTagCopy(uptr addr) const {
 }
 
 tag_t BaseReport::GetShortTagCopy(uptr addr) const {
-  CHECK_GE(addr, shadow.addr);
-  uptr idx = addr - shadow.addr;
+  CHECK_GE(addr, shadow.addr + kShortDumpOffset);
+  uptr idx = addr - shadow.addr - kShortDumpOffset;
   CHECK_LT(idx, ARRAY_SIZE(shadow.short_tags));
   return shadow.short_tags[idx];
 }
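The constants above replace the hard-coded 512/17/3 values: the copied shadow window now matches exactly what gets printed (17 rows of 16 tags), and short-granule tags are only collected for the 3 middle rows. A minimal standalone sketch of that geometry, assuming the usual 16-byte HWASan granule and a simplified stand-in for MemToShadow (the real one applies the dynamic shadow base):

// Standalone sketch, not the hwasan implementation. kShadowAlignment and the
// MemToShadow stand-in are illustrative assumptions only.
#include <cstdint>
#include <cstdio>

using uptr = uintptr_t;

constexpr uptr kShadowAlignment = 16;  // assumed granule size
constexpr uptr kDumpWidth = 16;
constexpr uptr kShadowLines = 17;
constexpr uptr kShadowDumpSize = kShadowLines * kDumpWidth;  // 272 tags
constexpr uptr kShortLines = 3;
constexpr uptr kShortDumpSize = kShortLines * kDumpWidth;    // 48 tags
constexpr uptr kShortDumpOffset =
    (kShadowLines - kShortLines) / 2 * kDumpWidth;           // 112

static uptr MemToShadow(uptr addr) { return addr / kShadowAlignment; } // stand-in
static uptr RoundDownTo(uptr x, uptr boundary) { return x & ~(boundary - 1); }

// Mirrors GetPrintTagStart: centre a kShadowLines x kDumpWidth window on addr.
static uptr GetPrintTagStart(uptr addr) {
  addr = MemToShadow(addr);
  addr = RoundDownTo(addr, kDumpWidth);
  addr -= kDumpWidth * (kShadowLines / 2);
  return addr;
}

int main() {
  uptr fault = 0x1234560;  // arbitrary example address
  uptr start = GetPrintTagStart(fault);
  std::printf("dump covers %zu tags starting at shadow %#zx\n",
              (size_t)kShadowDumpSize, (size_t)start);
  std::printf("short-granule rows occupy indices [%zu, %zu)\n",
              (size_t)kShortDumpOffset,
              (size_t)(kShortDumpOffset + kShortDumpSize));
  return 0;
}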

llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h

Lines changed: 2 additions & 0 deletions
@@ -364,6 +364,8 @@ class MachineIRBuilder {
     State.Observer = &Observer;
   }
 
+  GISelChangeObserver *getObserver() { return State.Observer; }
+
   void stopObservingChanges() { State.Observer = nullptr; }
 
   bool isObservingChanges() const { return State.Observer != nullptr; }

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 6 additions & 0 deletions
@@ -4147,6 +4147,12 @@ class TargetLowering : public TargetLoweringBase {
     return true;
   }
 
+  /// GlobalISel - return true if it's profitable to perform the combine:
+  /// shl ([sza]ext x), y => zext (shl x, y)
+  virtual bool isDesirableToPullExtFromShl(const MachineInstr &MI) const {
+    return true;
+  }
+
   // Return AndOrSETCCFoldKind::{AddAnd, ABS} if its desirable to try and
   // optimize LogicOp(SETCC0, SETCC1). An example (what is implemented as of
   // writing this) is:
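The hook added above gates the generic GlobalISel rewrite shl ([sza]ext x), y => zext (shl x, y); the AArch64 override further down simply declines it. A standalone sketch (plain C++, not LLVM code) of the two shapes being traded for a 32-bit value widened to 64 bits; they only agree while no set bits are shifted out of the narrow type, which is why the matcher consults known bits before rewriting:

// Standalone illustration only; the widths and values are arbitrary examples.
#include <cassert>
#include <cstdint>

// shl (zext a), c : widen first, then shift in 64 bits.
static uint64_t shl_of_zext(uint32_t a, unsigned c) { return (uint64_t)a << c; }
// zext (shl a, c) : shift in 32 bits, then widen the (possibly truncated) result.
static uint64_t zext_of_shl(uint32_t a, unsigned c) {
  return (uint64_t)(uint32_t)(a << c);
}

int main() {
  // When the shifted value still fits in 32 bits, both forms agree, and the
  // narrower shift is what the generic combine produces.
  assert(shl_of_zext(0x00001234u, 8) == zext_of_shl(0x00001234u, 8));
  // They diverge once bits would be shifted out of the narrow type, so the
  // rewrite is only attempted when known-bits analysis rules that out.
  assert(shl_of_zext(0x80000000u, 1) != zext_of_shl(0x80000000u, 1));
  return 0;
}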

llvm/include/llvm/Config/llvm-config.h.cmake

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@
 
 /* Indicate that this is LLVM compiled from the amd-gfx branch. */
 #define LLVM_HAVE_BRANCH_AMD_GFX
-#define LLVM_MAIN_REVISION 475561
+#define LLVM_MAIN_REVISION 475568
 
 /* Define if LLVM_ENABLE_DUMP is enabled */
 #cmakedefine LLVM_ENABLE_DUMP

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

Lines changed: 2 additions & 0 deletions
@@ -1719,6 +1719,8 @@ void CombinerHelper::applyCombineMulToShl(MachineInstr &MI,
 bool CombinerHelper::matchCombineShlOfExtend(MachineInstr &MI,
                                              RegisterImmPair &MatchData) {
   assert(MI.getOpcode() == TargetOpcode::G_SHL && KB);
+  if (!getTargetLowering().isDesirableToPullExtFromShl(MI))
+    return false;
 
   Register LHS = MI.getOperand(1).getReg();
llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 4 additions & 0 deletions
@@ -690,6 +690,10 @@ class AArch64TargetLowering : public TargetLowering {
   bool isDesirableToCommuteWithShift(const SDNode *N,
                                      CombineLevel Level) const override;
 
+  bool isDesirableToPullExtFromShl(const MachineInstr &MI) const override {
+    return false;
+  }
+
   /// Returns false if N is a bit extraction pattern of (X >> C) & Mask.
   bool isDesirableToCommuteXorWithShift(const SDNode *N) const override;
 
llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp

Lines changed: 203 additions & 1 deletion
@@ -20,7 +20,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64TargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
+#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/Combiner.h"
 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
@@ -439,6 +441,22 @@ class AArch64PostLegalizerCombiner : public MachineFunctionPass {
 private:
   bool IsOptNone;
   AArch64PostLegalizerCombinerImplRuleConfig RuleConfig;
+
+
+  struct StoreInfo {
+    GStore *St = nullptr;
+    // The G_PTR_ADD that's used by the store. We keep this to cache the
+    // MachineInstr def.
+    GPtrAdd *Ptr = nullptr;
+    // The signed offset to the Ptr instruction.
+    int64_t Offset = 0;
+    LLT StoredType;
+  };
+  bool tryOptimizeConsecStores(SmallVectorImpl<StoreInfo> &Stores,
+                               CSEMIRBuilder &MIB);
+
+  bool optimizeConsecutiveMemOpAddressing(MachineFunction &MF,
+                                          CSEMIRBuilder &MIB);
 };
 } // end anonymous namespace
 
@@ -492,7 +510,191 @@ bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
                      F.hasMinSize());
   AArch64PostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo,
                                         RuleConfig, ST, MDT, LI);
-  return Impl.combineMachineInstrs();
+  bool Changed = Impl.combineMachineInstrs();
+
+  auto MIB = CSEMIRBuilder(MF);
+  MIB.setCSEInfo(CSEInfo);
+  Changed |= optimizeConsecutiveMemOpAddressing(MF, MIB);
+  return Changed;
+}
+
+bool AArch64PostLegalizerCombiner::tryOptimizeConsecStores(
+    SmallVectorImpl<StoreInfo> &Stores, CSEMIRBuilder &MIB) {
+  if (Stores.size() <= 2)
+    return false;
+
+  // Profitability checks:
+  int64_t BaseOffset = Stores[0].Offset;
+  unsigned NumPairsExpected = Stores.size() / 2;
+  unsigned TotalInstsExpected = NumPairsExpected + (Stores.size() % 2);
+  // Size savings will depend on whether we can fold the offset, as an
+  // immediate of an ADD.
+  auto &TLI = *MIB.getMF().getSubtarget().getTargetLowering();
+  if (!TLI.isLegalAddImmediate(BaseOffset))
+    TotalInstsExpected++;
+  int SavingsExpected = Stores.size() - TotalInstsExpected;
+  if (SavingsExpected <= 0)
+    return false;
+
+  auto &MRI = MIB.getMF().getRegInfo();
+
+  // We have a series of consecutive stores. Factor out the common base
+  // pointer and rewrite the offsets.
+  Register NewBase = Stores[0].Ptr->getReg(0);
+  for (auto &SInfo : Stores) {
+    // Compute a new pointer with the new base ptr and adjusted offset.
+    MIB.setInstrAndDebugLoc(*SInfo.St);
+    auto NewOff = MIB.buildConstant(LLT::scalar(64), SInfo.Offset - BaseOffset);
+    auto NewPtr = MIB.buildPtrAdd(MRI.getType(SInfo.St->getPointerReg()),
+                                  NewBase, NewOff);
+    if (MIB.getObserver())
+      MIB.getObserver()->changingInstr(*SInfo.St);
+    SInfo.St->getOperand(1).setReg(NewPtr.getReg(0));
+    if (MIB.getObserver())
+      MIB.getObserver()->changedInstr(*SInfo.St);
+  }
+  LLVM_DEBUG(dbgs() << "Split a series of " << Stores.size()
+                    << " stores into a base pointer and offsets.\n");
+  return true;
+}
+
+static cl::opt<bool>
+    EnableConsecutiveMemOpOpt("aarch64-postlegalizer-consecutive-memops",
+                              cl::init(true), cl::Hidden,
+                              cl::desc("Enable consecutive memop optimization "
+                                       "in AArch64PostLegalizerCombiner"));
+
+bool AArch64PostLegalizerCombiner::optimizeConsecutiveMemOpAddressing(
+    MachineFunction &MF, CSEMIRBuilder &MIB) {
+  // This combine needs to run after all reassociations/folds on pointer
+  // addressing have been done, specifically those that combine two G_PTR_ADDs
+  // with constant offsets into a single G_PTR_ADD with a combined offset.
+  // The goal of this optimization is to undo that combine in the case where
+  // doing so has prevented the formation of pair stores due to illegal
+  // addressing modes of STP. The reason that we do it here is because
+  // it's much easier to undo the transformation of a series of consecutive
+  // mem ops, than it is to detect when doing it would be a bad idea looking
+  // at a single G_PTR_ADD in the reassociation/ptradd_immed_chain combine.
+  //
+  // An example:
+  // G_STORE %11:_(<2 x s64>), %base:_(p0) :: (store (<2 x s64>), align 1)
+  // %off1:_(s64) = G_CONSTANT i64 4128
+  // %p1:_(p0) = G_PTR_ADD %base:_, %off1:_(s64)
+  // G_STORE %11:_(<2 x s64>), %p1:_(p0) :: (store (<2 x s64>), align 1)
+  // %off2:_(s64) = G_CONSTANT i64 4144
+  // %p2:_(p0) = G_PTR_ADD %base:_, %off2:_(s64)
+  // G_STORE %11:_(<2 x s64>), %p2:_(p0) :: (store (<2 x s64>), align 1)
+  // %off3:_(s64) = G_CONSTANT i64 4160
+  // %p3:_(p0) = G_PTR_ADD %base:_, %off3:_(s64)
+  // G_STORE %11:_(<2 x s64>), %p3:_(p0) :: (store (<2 x s64>), align 1)
+  bool Changed = false;
+  auto &MRI = MF.getRegInfo();
+
+  if (!EnableConsecutiveMemOpOpt)
+    return Changed;
+
+  SmallVector<StoreInfo, 8> Stores;
+  // If we see a load, then we keep track of any values defined by it.
+  // In the following example, STP formation will fail anyway because
+  // the latter store is using a load result that appears after the
+  // prior store. In this situation if we factor out the offset then
+  // we increase code size for no benefit.
+  // G_STORE %v1:_(s64), %base:_(p0) :: (store (s64))
+  // %v2:_(s64) = G_LOAD %ldptr:_(p0) :: (load (s64))
+  // G_STORE %v2:_(s64), %base:_(p0) :: (store (s64))
+  SmallVector<Register> LoadValsSinceLastStore;
+
+  auto storeIsValid = [&](StoreInfo &Last, StoreInfo New) {
+    // Check if this store is consecutive to the last one.
+    if (Last.Ptr->getBaseReg() != New.Ptr->getBaseReg() ||
+        (Last.Offset + static_cast<int64_t>(Last.StoredType.getSizeInBytes()) !=
+         New.Offset) ||
+        Last.StoredType != New.StoredType)
+      return false;
+
+    // Check if this store is using a load result that appears after the
+    // last store. If so, bail out.
+    if (any_of(LoadValsSinceLastStore, [&](Register LoadVal) {
+          return New.St->getValueReg() == LoadVal;
+        }))
+      return false;
+
+    // Check if the current offset would be too large for STP.
+    // If not, then STP formation should be able to handle it, so we don't
+    // need to do anything.
+    int64_t MaxLegalOffset;
+    switch (New.StoredType.getSizeInBits()) {
+    case 32:
+      MaxLegalOffset = 252;
+      break;
+    case 64:
+      MaxLegalOffset = 504;
+      break;
+    case 128:
+      MaxLegalOffset = 1008;
+      break;
+    default:
+      llvm_unreachable("Unexpected stored type size");
+    }
+    if (New.Offset < MaxLegalOffset)
+      return false;
+
+    // If factoring it out still wouldn't help then don't bother.
+    return New.Offset - Stores[0].Offset <= MaxLegalOffset;
+  };
+
+  auto resetState = [&]() {
+    Stores.clear();
+    LoadValsSinceLastStore.clear();
+  };
+
+  for (auto &MBB : MF) {
+    // We're looking inside a single BB at a time since the memset pattern
+    // should only be in a single block.
+    resetState();
+    for (auto &MI : MBB) {
+      if (auto *St = dyn_cast<GStore>(&MI)) {
+        Register PtrBaseReg;
+        APInt Offset;
+        LLT StoredValTy = MRI.getType(St->getValueReg());
+        unsigned ValSize = StoredValTy.getSizeInBits();
+        if (ValSize < 32 || ValSize != St->getMMO().getSizeInBits())
+          continue;
+
+        Register PtrReg = St->getPointerReg();
+        if (mi_match(
+                PtrReg, MRI,
+                m_OneNonDBGUse(m_GPtrAdd(m_Reg(PtrBaseReg), m_ICst(Offset))))) {
+          GPtrAdd *PtrAdd = cast<GPtrAdd>(MRI.getVRegDef(PtrReg));
+          StoreInfo New = {St, PtrAdd, Offset.getSExtValue(), StoredValTy};
+
+          if (Stores.empty()) {
+            Stores.push_back(New);
+            continue;
+          }
+
+          // Check if this store is a valid continuation of the sequence.
+          auto &Last = Stores.back();
+          if (storeIsValid(Last, New)) {
+            Stores.push_back(New);
+            LoadValsSinceLastStore.clear(); // Reset the load value tracking.
+          } else {
+            // The store isn't valid to consider for the prior sequence,
+            // so try to optimize what we have so far and start a new sequence.
+            Changed |= tryOptimizeConsecStores(Stores, MIB);
+            resetState();
+            Stores.push_back(New);
+          }
+        }
+      } else if (auto *Ld = dyn_cast<GLoad>(&MI)) {
+        LoadValsSinceLastStore.push_back(Ld->getDstReg());
+      }
+    }
+    Changed |= tryOptimizeConsecStores(Stores, MIB);
+    resetState();
+  }
+
+  return Changed;
 }
 
 char AArch64PostLegalizerCombiner::ID = 0;
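For reference, a standalone sketch (plain C++, not the pass itself) of the bookkeeping in tryOptimizeConsecStores: the STP offset limits used by storeIsValid, the expected instruction savings, and the per-store offsets once a common base pointer is factored out. Whether the base offset is a legal ADD immediate is taken as an input here; the pass queries TLI.isLegalAddImmediate().

// Illustrative sketch under the assumptions stated above.
#include <cstdint>
#include <cstdio>
#include <vector>

// Mirrors the STP offset limits from storeIsValid.
static int64_t maxLegalSTPOffset(unsigned storeSizeInBits) {
  switch (storeSizeInBits) {
  case 32:  return 252;
  case 64:  return 504;
  case 128: return 1008;
  default:  return -1; // the pass treats other sizes as unreachable
  }
}

// N stores become N/2 STPs plus a trailing STR if N is odd, plus one ADD if
// the base offset cannot be folded into an add-immediate.
static int expectedSavings(int numStores, bool baseOffsetIsLegalAddImm) {
  int pairs = numStores / 2;
  int total = pairs + (numStores % 2) + (baseOffsetIsLegalAddImm ? 0 : 1);
  return numStores - total;
}

int main() {
  // Offsets from the example in the pass comment; all beyond STP's
  // 1008-byte limit for 128-bit stores.
  std::vector<int64_t> offsets = {4128, 4144, 4160};
  std::printf("max STP offset for 128-bit stores: %lld\n",
              (long long)maxLegalSTPOffset(128));
  for (int64_t off : offsets) // offsets rewritten against the factored-out base
    std::printf("old offset %lld -> new offset %lld\n", (long long)off,
                (long long)(off - offsets[0]));
  // Savings depend on run length and on whether the base offset folds into an
  // ADD immediate (assumed not foldable here).
  std::printf("expected savings: 3 stores -> %d, 4 stores -> %d\n",
              expectedSavings(3, false), expectedSavings(4, false));
  return 0;
}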

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp

Lines changed: 4 additions & 10 deletions
@@ -532,21 +532,15 @@ void AMDGPUInstPrinter::printDefaultVccOperand(bool FirstOperand,
 void AMDGPUInstPrinter::printWaitVDST(const MCInst *MI, unsigned OpNo,
                                       const MCSubtargetInfo &STI,
                                       raw_ostream &O) {
-  uint8_t Imm = MI->getOperand(OpNo).getImm();
-  if (Imm != 0) {
-    O << " wait_vdst:";
-    printU4ImmDecOperand(MI, OpNo, O);
-  }
+  O << " wait_vdst:";
+  printU4ImmDecOperand(MI, OpNo, O);
 }
 
 void AMDGPUInstPrinter::printWaitEXP(const MCInst *MI, unsigned OpNo,
                                      const MCSubtargetInfo &STI,
                                      raw_ostream &O) {
-  uint8_t Imm = MI->getOperand(OpNo).getImm();
-  if (Imm != 0) {
-    O << " wait_exp:";
-    printU4ImmDecOperand(MI, OpNo, O);
-  }
+  O << " wait_exp:";
+  printU4ImmDecOperand(MI, OpNo, O);
 }
 
 bool AMDGPUInstPrinter::needsImpliedVcc(const MCInstrDesc &Desc,
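The change above makes printWaitVDST and printWaitEXP emit their field even when the immediate is zero. A tiny standalone sketch of the before/after output behaviour:

// Standalone illustration only; the real printers go through printU4ImmDecOperand.
#include <cstdio>

static void printOld(unsigned imm) {
  if (imm != 0)                          // old behaviour: zero was suppressed
    std::printf(" wait_vdst:%u", imm);
}

static void printNew(unsigned imm) {
  std::printf(" wait_vdst:%u", imm);     // new behaviour: always printed
}

int main() {
  printOld(0);  // prints nothing
  printNew(0);  // prints " wait_vdst:0"
  std::printf("\n");
  return 0;
}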
