Skip to content

Commit 7eb2283

Browse files
committed
[AMDGPU][SDAG] Add target-specific ISD::PTRADD combines
This patch adds several (AMDGPU-)target-specific DAG combines for ISD::PTRADD nodes that reproduce existing similar transforms for ISD::ADD nodes. There is no functional change intended for the existing target-specific PTRADD combine. For SWDEV-516125.
1 parent a448f7e commit 7eb2283

File tree

3 files changed

+167
-139
lines changed

3 files changed

+167
-139
lines changed

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6706,7 +6706,9 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,
67066706
return SDValue();
67076707
int64_t Offset = C2->getSExtValue();
67086708
switch (Opcode) {
6709-
case ISD::ADD: break;
6709+
case ISD::ADD:
6710+
case ISD::PTRADD:
6711+
break;
67106712
case ISD::SUB: Offset = -uint64_t(Offset); break;
67116713
default: return SDValue();
67126714
}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 116 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include "llvm/CodeGen/MachineFrameInfo.h"
3434
#include "llvm/CodeGen/MachineFunction.h"
3535
#include "llvm/CodeGen/MachineLoopInfo.h"
36+
#include "llvm/CodeGen/SDPatternMatch.h"
3637
#include "llvm/IR/DiagnosticInfo.h"
3738
#include "llvm/IR/IRBuilder.h"
3839
#include "llvm/IR/IntrinsicInst.h"
@@ -46,6 +47,7 @@
4647
#include <optional>
4748

4849
using namespace llvm;
50+
using namespace llvm::SDPatternMatch;
4951

5052
#define DEBUG_TYPE "si-lower"
5153

@@ -14329,7 +14331,7 @@ static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
1432914331
// instead of a tree.
1433014332
SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
1433114333
DAGCombinerInfo &DCI) const {
14332-
assert(N->getOpcode() == ISD::ADD);
14334+
assert(N->isAnyAdd());
1433314335

1433414336
SelectionDAG &DAG = DCI.DAG;
1433514337
EVT VT = N->getValueType(0);
@@ -14362,7 +14364,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
1436214364
for (SDNode *User : LHS->users()) {
1436314365
// There is a use that does not feed into addition, so the multiply can't
1436414366
// be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
14365-
if (User->getOpcode() != ISD::ADD)
14367+
if (!User->isAnyAdd())
1436614368
return SDValue();
1436714369

1436814370
// We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
@@ -14474,8 +14476,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
1447414476

1447514477
SDValue Hi = getHiHalf64(LHS, DAG);
1447614478
SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
14479+
unsigned Opcode = N->getOpcode();
14480+
if (Opcode == ISD::PTRADD)
14481+
Opcode = ISD::ADD;
1447714482
SDValue AddHi =
14478-
DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
14483+
DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
1447914484

1448014485
SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
1448114486
return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
@@ -14949,44 +14954,120 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
1494914954
DAGCombinerInfo &DCI) const {
1495014955
SelectionDAG &DAG = DCI.DAG;
1495114956
SDLoc DL(N);
14957+
EVT VT = N->getValueType(0);
1495214958
SDValue N0 = N->getOperand(0);
1495314959
SDValue N1 = N->getOperand(1);
1495414960

14955-
if (N1.getOpcode() == ISD::ADD) {
14956-
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
14957-
// y is not, and (add y, z) is used only once.
14958-
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
14959-
// z is not, and (add y, z) is used only once.
14960-
// The goal is to move constant offsets to the outermost ptradd, to create
14961-
// more opportunities to fold offsets into memory instructions.
14962-
// Together with the generic combines in DAGCombiner.cpp, this also
14963-
// implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
14964-
//
14965-
// This transform is here instead of in the general DAGCombiner as it can
14966-
// turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
14967-
// AArch64's CPA.
14968-
SDValue X = N0;
14969-
SDValue Y = N1.getOperand(0);
14970-
SDValue Z = N1.getOperand(1);
14971-
bool N1OneUse = N1.hasOneUse();
14972-
bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
14973-
bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
14974-
if ((ZIsConstant != YIsConstant) && N1OneUse) {
14975-
SDNodeFlags Flags;
14976-
// If both additions in the original were NUW, the new ones are as well.
14977-
if (N->getFlags().hasNoUnsignedWrap() &&
14978-
N1->getFlags().hasNoUnsignedWrap())
14979-
Flags |= SDNodeFlags::NoUnsignedWrap;
14980-
14981-
if (YIsConstant)
14982-
std::swap(Y, Z);
14983-
14984-
SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags);
14985-
DCI.AddToWorklist(Inner.getNode());
14986-
return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags);
14961+
// The following folds transform PTRADDs into regular arithmetic in cases
14962+
// where the PTRADD wouldn't be folded as an immediate offset into memory
14963+
// instructions anyway. They are target-specific in that other targets might
14964+
// prefer to not lose information about the pointer arithmetic.
14965+
14966+
// Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
14967+
// Adapted from DAGCombiner::visitADDLikeCommutative.
14968+
SDValue V, K;
14969+
if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
14970+
SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K);
14971+
DCI.AddToWorklist(Inner.getNode());
14972+
return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
14973+
}
14974+
14975+
// Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
14976+
// performAddCombine.
14977+
if (N1.getOpcode() == ISD::MUL) {
14978+
if (Subtarget->hasMad64_32()) {
14979+
if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14980+
return Folded;
14981+
}
14982+
}
14983+
14984+
// If the 32 low bits of the constant are all zero, there is nothing to fold
14985+
// into an immediate offset, so it's better to eliminate the unnecessary
14986+
// addition for the lower 32 bits than to preserve the PTRADD.
14987+
// Analogous to a fold in performAddCombine.
14988+
if (VT == MVT::i64) {
14989+
if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14990+
return Folded;
14991+
}
14992+
14993+
if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
14994+
// Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c), v) with
14995+
// global address GA and constant c, such that c can be folded into GA.
14996+
SDValue GAValue = N0.getOperand(0);
14997+
if (const GlobalAddressSDNode *GA =
14998+
dyn_cast<GlobalAddressSDNode>(GAValue)) {
14999+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15000+
if (DCI.isBeforeLegalizeOps() && TLI.isOffsetFoldingLegal(GA)) {
15001+
SDNodeFlags Flags;
15002+
// If both additions in the original were NUW, reassociation preserves
15003+
// that.
15004+
if (N->getFlags().hasNoUnsignedWrap() &&
15005+
N0->getFlags().hasNoUnsignedWrap())
15006+
Flags |= SDNodeFlags::NoUnsignedWrap;
15007+
SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
15008+
DCI.AddToWorklist(Inner.getNode());
15009+
return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
15010+
}
1498715011
}
1498815012
}
1498915013

15014+
if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
15015+
return SDValue();
15016+
15017+
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
15018+
// y is not, and (add y, z) is used only once.
15019+
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
15020+
// z is not, and (add y, z) is used only once.
15021+
// The goal is to move constant offsets to the outermost ptradd, to create
15022+
// more opportunities to fold offsets into memory instructions.
15023+
// Together with the generic combines in DAGCombiner.cpp, this also
15024+
// implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y).
15025+
//
15026+
// This transform is here instead of in the general DAGCombiner as it can
15027+
// turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
15028+
// AArch64's CPA.
15029+
SDValue X = N0;
15030+
SDValue Y = N1.getOperand(0);
15031+
SDValue Z = N1.getOperand(1);
15032+
bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
15033+
bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
15034+
15035+
SDNodeFlags ReassocFlags;
15036+
// If both additions in the original were NUW, reassociation preserves that.
15037+
if (N->getFlags().hasNoUnsignedWrap() && N1->getFlags().hasNoUnsignedWrap())
15038+
ReassocFlags |= SDNodeFlags::NoUnsignedWrap;
15039+
if (ZIsConstant != YIsConstant) {
15040+
15041+
if (YIsConstant)
15042+
std::swap(Y, Z);
15043+
15044+
SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
15045+
DCI.AddToWorklist(Inner.getNode());
15046+
return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
15047+
}
15048+
15049+
// If exactly one of Y and Z is constant, that case was handled above. If both were
15050+
// constant, the addition would have been folded in SelectionDAG::getNode
15051+
// already. This ensures that the generic DAG combines won't undo the
15052+
// following reassociation.
15053+
assert(!YIsConstant && !ZIsConstant);
15054+
15055+
if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
15056+
// Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
15057+
// y are uniform and z isn't.
15058+
// Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
15059+
// z are uniform and y isn't.
15060+
// The goal is to push uniform operands up in the computation, so that they
15061+
// can be handled with scalar operations. We can't use reassociateScalarOps
15062+
// for this since it requires two identical commutative operations to
15063+
// reassociate.
15064+
if (Y->isDivergent())
15065+
std::swap(Y, Z);
15066+
SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
15067+
DCI.AddToWorklist(UniformInner.getNode());
15068+
return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
15069+
}
15070+
1499015071
return SDValue();
1499115072
}
1499215073

0 commit comments

Comments
 (0)