Skip to content

Commit 11bd2c5

Browse files
committed
[AMDGPU][SDAG] Add target-specific ISD::PTRADD combines
This patch adds several (AMDGPU-)target-specific DAG combines for ISD::PTRADD nodes that reproduce existing similar transforms for ISD::ADD nodes. There is no functional change intended for the existing target-specific PTRADD combine. For SWDEV-516125.
1 parent b209c96 commit 11bd2c5

File tree

3 files changed

+167
-139
lines changed

3 files changed

+167
-139
lines changed

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6765,7 +6765,9 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,
67656765
return SDValue();
67666766
int64_t Offset = C2->getSExtValue();
67676767
switch (Opcode) {
6768-
case ISD::ADD: break;
6768+
case ISD::ADD:
6769+
case ISD::PTRADD:
6770+
break;
67696771
case ISD::SUB: Offset = -uint64_t(Offset); break;
67706772
default: return SDValue();
67716773
}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 116 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include "llvm/CodeGen/MachineFrameInfo.h"
3434
#include "llvm/CodeGen/MachineFunction.h"
3535
#include "llvm/CodeGen/MachineLoopInfo.h"
36+
#include "llvm/CodeGen/SDPatternMatch.h"
3637
#include "llvm/IR/DiagnosticInfo.h"
3738
#include "llvm/IR/IRBuilder.h"
3839
#include "llvm/IR/IntrinsicInst.h"
@@ -46,6 +47,7 @@
4647
#include <optional>
4748

4849
using namespace llvm;
50+
using namespace llvm::SDPatternMatch;
4951

5052
#define DEBUG_TYPE "si-lower"
5153

@@ -14320,7 +14322,7 @@ static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
1432014322
// instead of a tree.
1432114323
SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
1432214324
DAGCombinerInfo &DCI) const {
14323-
assert(N->getOpcode() == ISD::ADD);
14325+
assert(N->isAnyAdd());
1432414326

1432514327
SelectionDAG &DAG = DCI.DAG;
1432614328
EVT VT = N->getValueType(0);
@@ -14353,7 +14355,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
1435314355
for (SDNode *User : LHS->users()) {
1435414356
// There is a use that does not feed into addition, so the multiply can't
1435514357
// be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
14356-
if (User->getOpcode() != ISD::ADD)
14358+
if (!User->isAnyAdd())
1435714359
return SDValue();
1435814360

1435914361
// We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
@@ -14465,8 +14467,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
1446514467

1446614468
SDValue Hi = getHiHalf64(LHS, DAG);
1446714469
SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
14470+
unsigned Opcode = N->getOpcode();
14471+
if (Opcode == ISD::PTRADD)
14472+
Opcode = ISD::ADD;
1446814473
SDValue AddHi =
14469-
DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
14474+
DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
1447014475

1447114476
SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
1447214477
return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
@@ -14940,44 +14945,120 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
1494014945
DAGCombinerInfo &DCI) const {
1494114946
SelectionDAG &DAG = DCI.DAG;
1494214947
SDLoc DL(N);
14948+
EVT VT = N->getValueType(0);
1494314949
SDValue N0 = N->getOperand(0);
1494414950
SDValue N1 = N->getOperand(1);
1494514951

14946-
if (N1.getOpcode() == ISD::ADD) {
14947-
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
14948-
// y is not, and (add y, z) is used only once.
14949-
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
14950-
// z is not, and (add y, z) is used only once.
14951-
// The goal is to move constant offsets to the outermost ptradd, to create
14952-
// more opportunities to fold offsets into memory instructions.
14953-
// Together with the generic combines in DAGCombiner.cpp, this also
14954-
// implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
14955-
//
14956-
// This transform is here instead of in the general DAGCombiner as it can
14957-
// turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
14958-
// AArch64's CPA.
14959-
SDValue X = N0;
14960-
SDValue Y = N1.getOperand(0);
14961-
SDValue Z = N1.getOperand(1);
14962-
bool N1OneUse = N1.hasOneUse();
14963-
bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
14964-
bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
14965-
if ((ZIsConstant != YIsConstant) && N1OneUse) {
14966-
SDNodeFlags Flags;
14967-
// If both additions in the original were NUW, the new ones are as well.
14968-
if (N->getFlags().hasNoUnsignedWrap() &&
14969-
N1->getFlags().hasNoUnsignedWrap())
14970-
Flags |= SDNodeFlags::NoUnsignedWrap;
14971-
14972-
if (YIsConstant)
14973-
std::swap(Y, Z);
14974-
14975-
SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags);
14976-
DCI.AddToWorklist(Inner.getNode());
14977-
return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags);
14952+
// The following folds transform PTRADDs into regular arithmetic in cases
14953+
// where the PTRADD wouldn't be folded as an immediate offset into memory
14954+
// instructions anyway. They are target-specific in that other targets might
14955+
// prefer to not lose information about the pointer arithmetic.
14956+
14957+
// Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
14958+
// Adapted from DAGCombiner::visitADDLikeCommutative.
14959+
SDValue V, K;
14960+
if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
14961+
SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K);
14962+
DCI.AddToWorklist(Inner.getNode());
14963+
return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
14964+
}
14965+
14966+
// Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
14967+
// performAddCombine.
14968+
if (N1.getOpcode() == ISD::MUL) {
14969+
if (Subtarget->hasMad64_32()) {
14970+
if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14971+
return Folded;
14972+
}
14973+
}
14974+
14975+
// If the 32 low bits of the constant are all zero, there is nothing to fold
14976+
// into an immediate offset, so it's better to eliminate the unnecessary
14977+
// addition for the lower 32 bits than to preserve the PTRADD.
14978+
// Analogous to a fold in performAddCombine.
14979+
if (VT == MVT::i64) {
14980+
if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14981+
return Folded;
14982+
}
14983+
14984+
if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
14985+
// Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c), v) with
14986+
// global address GA and constant c, such that c can be folded into GA.
14987+
SDValue GAValue = N0.getOperand(0);
14988+
if (const GlobalAddressSDNode *GA =
14989+
dyn_cast<GlobalAddressSDNode>(GAValue)) {
14990+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14991+
if (DCI.isBeforeLegalizeOps() && TLI.isOffsetFoldingLegal(GA)) {
14992+
SDNodeFlags Flags;
14993+
// If both additions in the original were NUW, reassociation preserves
14994+
// that.
14995+
if (N->getFlags().hasNoUnsignedWrap() &&
14996+
N0->getFlags().hasNoUnsignedWrap())
14997+
Flags |= SDNodeFlags::NoUnsignedWrap;
14998+
SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
14999+
DCI.AddToWorklist(Inner.getNode());
15000+
return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
15001+
}
1497815002
}
1497915003
}
1498015004

15005+
if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
15006+
return SDValue();
15007+
15008+
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
15009+
// y is not, and (add y, z) is used only once.
15010+
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
15011+
// z is not, and (add y, z) is used only once.
15012+
// The goal is to move constant offsets to the outermost ptradd, to create
15013+
// more opportunities to fold offsets into memory instructions.
15014+
// Together with the generic combines in DAGCombiner.cpp, this also
15015+
// implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y).
15016+
//
15017+
// This transform is here instead of in the general DAGCombiner as it can
15018+
// turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
15019+
// AArch64's CPA.
15020+
SDValue X = N0;
15021+
SDValue Y = N1.getOperand(0);
15022+
SDValue Z = N1.getOperand(1);
15023+
bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
15024+
bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
15025+
15026+
SDNodeFlags ReassocFlags;
15027+
// If both additions in the original were NUW, reassociation preserves that.
15028+
if (N->getFlags().hasNoUnsignedWrap() && N1->getFlags().hasNoUnsignedWrap())
15029+
ReassocFlags |= SDNodeFlags::NoUnsignedWrap;
15030+
if (ZIsConstant != YIsConstant) {
15031+
15032+
if (YIsConstant)
15033+
std::swap(Y, Z);
15034+
15035+
SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
15036+
DCI.AddToWorklist(Inner.getNode());
15037+
return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
15038+
}
15039+
15040+
// If one of Y and Z is constant, they have been handled above. If both were
15041+
// constant, the addition would have been folded in SelectionDAG::getNode
15042+
// already. This ensures that the generic DAG combines won't undo the
15043+
// following reassociation.
15044+
assert(!YIsConstant && !ZIsConstant);
15045+
15046+
if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
15047+
// Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
15048+
// y are uniform and z isn't.
15049+
// Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
15050+
// z are uniform and y isn't.
15051+
// The goal is to push uniform operands up in the computation, so that they
15052+
// can be handled with scalar operations. We can't use reassociateScalarOps
15053+
// for this since it requires two identical commutative operations to
15054+
// reassociate.
15055+
if (Y->isDivergent())
15056+
std::swap(Y, Z);
15057+
SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
15058+
DCI.AddToWorklist(UniformInner.getNode());
15059+
return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
15060+
}
15061+
1498115062
return SDValue();
1498215063
}
1498315064

0 commit comments

Comments
 (0)