|
33 | 33 | #include "llvm/CodeGen/MachineFrameInfo.h"
|
34 | 34 | #include "llvm/CodeGen/MachineFunction.h"
|
35 | 35 | #include "llvm/CodeGen/MachineLoopInfo.h"
|
| 36 | +#include "llvm/CodeGen/SDPatternMatch.h" |
36 | 37 | #include "llvm/IR/DiagnosticInfo.h"
|
37 | 38 | #include "llvm/IR/IRBuilder.h"
|
38 | 39 | #include "llvm/IR/IntrinsicInst.h"
|
|
46 | 47 | #include <optional>
|
47 | 48 |
|
48 | 49 | using namespace llvm;
|
| 50 | +using namespace llvm::SDPatternMatch; |
49 | 51 |
|
50 | 52 | #define DEBUG_TYPE "si-lower"
|
51 | 53 |
|
@@ -14329,7 +14331,7 @@ static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
|
14329 | 14331 | // instead of a tree.
|
14330 | 14332 | SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
|
14331 | 14333 | DAGCombinerInfo &DCI) const {
|
14332 |
| - assert(N->getOpcode() == ISD::ADD); |
| 14334 | + assert(N->isAnyAdd()); |
14333 | 14335 |
|
14334 | 14336 | SelectionDAG &DAG = DCI.DAG;
|
14335 | 14337 | EVT VT = N->getValueType(0);
|
@@ -14362,7 +14364,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
|
14362 | 14364 | for (SDNode *User : LHS->users()) {
|
14363 | 14365 | // There is a use that does not feed into addition, so the multiply can't
|
14364 | 14366 | // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
|
14365 |
| - if (User->getOpcode() != ISD::ADD) |
| 14367 | + if (!User->isAnyAdd()) |
14366 | 14368 | return SDValue();
|
14367 | 14369 |
|
14368 | 14370 | // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
|
@@ -14474,8 +14476,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
|
14474 | 14476 |
|
14475 | 14477 | SDValue Hi = getHiHalf64(LHS, DAG);
|
14476 | 14478 | SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
|
| 14479 | + unsigned Opcode = N->getOpcode(); |
| 14480 | + if (Opcode == ISD::PTRADD) |
| 14481 | + Opcode = ISD::ADD; |
14477 | 14482 | SDValue AddHi =
|
14478 |
| - DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags()); |
| 14483 | + DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags()); |
14479 | 14484 |
|
14480 | 14485 | SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
|
14481 | 14486 | return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
|
@@ -14949,44 +14954,120 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
|
14949 | 14954 | DAGCombinerInfo &DCI) const {
|
14950 | 14955 | SelectionDAG &DAG = DCI.DAG;
|
14951 | 14956 | SDLoc DL(N);
|
| 14957 | + EVT VT = N->getValueType(0); |
14952 | 14958 | SDValue N0 = N->getOperand(0);
|
14953 | 14959 | SDValue N1 = N->getOperand(1);
|
14954 | 14960 |
|
14955 |
| - if (N1.getOpcode() == ISD::ADD) { |
14956 |
| - // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, |
14957 |
| - // y is not, and (add y, z) is used only once. |
14958 |
| - // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, |
14959 |
| - // z is not, and (add y, z) is used only once. |
14960 |
| - // The goal is to move constant offsets to the outermost ptradd, to create |
14961 |
| - // more opportunities to fold offsets into memory instructions. |
14962 |
| - // Together with the generic combines in DAGCombiner.cpp, this also |
14963 |
| - // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). |
14964 |
| - // |
14965 |
| - // This transform is here instead of in the general DAGCombiner as it can |
14966 |
| - // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for |
14967 |
| - // AArch64's CPA. |
14968 |
| - SDValue X = N0; |
14969 |
| - SDValue Y = N1.getOperand(0); |
14970 |
| - SDValue Z = N1.getOperand(1); |
14971 |
| - bool N1OneUse = N1.hasOneUse(); |
14972 |
| - bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); |
14973 |
| - bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); |
14974 |
| - if ((ZIsConstant != YIsConstant) && N1OneUse) { |
14975 |
| - SDNodeFlags Flags; |
14976 |
| - // If both additions in the original were NUW, the new ones are as well. |
14977 |
| - if (N->getFlags().hasNoUnsignedWrap() && |
14978 |
| - N1->getFlags().hasNoUnsignedWrap()) |
14979 |
| - Flags |= SDNodeFlags::NoUnsignedWrap; |
14980 |
| - |
14981 |
| - if (YIsConstant) |
14982 |
| - std::swap(Y, Z); |
14983 |
| - |
14984 |
| - SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags); |
14985 |
| - DCI.AddToWorklist(Inner.getNode()); |
14986 |
| - return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags); |
| 14961 | + // The following folds transform PTRADDs into regular arithmetic in cases |
| 14962 | + // where the PTRADD wouldn't be folded as an immediate offset into memory |
| 14963 | + // instructions anyway. They are target-specific in that other targets might |
| 14964 | + // prefer to not lose information about the pointer arithmetic. |
| 14965 | + |
| 14966 | + // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)). |
| 14967 | + // Adapted from DAGCombiner::visitADDLikeCommutative. |
| 14968 | + SDValue V, K; |
| 14969 | + if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) { |
| 14970 | + SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K); |
| 14971 | + DCI.AddToWorklist(Inner.getNode()); |
| 14972 | + return DAG.getNode(ISD::SUB, DL, VT, N0, Inner); |
| 14973 | + } |
| 14974 | + |
| 14975 | + // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in |
| 14976 | + // performAddCombine. |
| 14977 | + if (N1.getOpcode() == ISD::MUL) { |
| 14978 | + if (Subtarget->hasMad64_32()) { |
| 14979 | + if (SDValue Folded = tryFoldToMad64_32(N, DCI)) |
| 14980 | + return Folded; |
| 14981 | + } |
| 14982 | + } |
| 14983 | + |
| 14984 | + // If the 32 low bits of the constant are all zero, there is nothing to fold |
| 14985 | + // into an immediate offset, so it's better to eliminate the unnecessary |
| 14986 | + // addition for the lower 32 bits than to preserve the PTRADD. |
| 14987 | + // Analogous to a fold in performAddCombine. |
| 14988 | + if (VT == MVT::i64) { |
| 14989 | + if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI)) |
| 14990 | + return Folded; |
| 14991 | + } |
| 14992 | + |
| 14993 | + if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) { |
| 14994 | + // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with |
| 14995 | + // global address GA and constant c, such that c can be folded into GA. |
| 14996 | + SDValue GAValue = N0.getOperand(0); |
| 14997 | + if (const GlobalAddressSDNode *GA = |
| 14998 | + dyn_cast<GlobalAddressSDNode>(GAValue)) { |
| 14999 | + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
| 15000 | + if (DCI.isBeforeLegalizeOps() && TLI.isOffsetFoldingLegal(GA)) { |
| 15001 | + SDNodeFlags Flags; |
| 15002 | + // If both additions in the original were NUW, reassociation preserves |
| 15003 | + // that. |
| 15004 | + if (N->getFlags().hasNoUnsignedWrap() && |
| 15005 | + N0->getFlags().hasNoUnsignedWrap()) |
| 15006 | + Flags |= SDNodeFlags::NoUnsignedWrap; |
| 15007 | + SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags); |
| 15008 | + DCI.AddToWorklist(Inner.getNode()); |
| 15009 | + return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags); |
| 15010 | + } |
14987 | 15011 | }
|
14988 | 15012 | }
|
14989 | 15013 |
|
| 15014 | + if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse()) |
| 15015 | + return SDValue(); |
| 15016 | + |
| 15017 | + // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, |
| 15018 | + // y is not, and (add y, z) is used only once. |
| 15019 | + // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, |
| 15020 | + // z is not, and (add y, z) is used only once. |
| 15021 | + // The goal is to move constant offsets to the outermost ptradd, to create |
| 15022 | + // more opportunities to fold offsets into memory instructions. |
| 15023 | + // Together with the generic combines in DAGCombiner.cpp, this also |
| 15024 | + // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y). |
| 15025 | + // |
| 15026 | + // This transform is here instead of in the general DAGCombiner as it can |
| 15027 | + // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for |
| 15028 | + // AArch64's CPA. |
| 15029 | + SDValue X = N0; |
| 15030 | + SDValue Y = N1.getOperand(0); |
| 15031 | + SDValue Z = N1.getOperand(1); |
| 15032 | + bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); |
| 15033 | + bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); |
| 15034 | + |
| 15035 | + SDNodeFlags ReassocFlags; |
| 15036 | + // If both additions in the original were NUW, reassociation preserves that. |
| 15037 | + if (N->getFlags().hasNoUnsignedWrap() && N1->getFlags().hasNoUnsignedWrap()) |
| 15038 | + ReassocFlags |= SDNodeFlags::NoUnsignedWrap; |
| 15039 | + if (ZIsConstant != YIsConstant) { |
| 15040 | + |
| 15041 | + if (YIsConstant) |
| 15042 | + std::swap(Y, Z); |
| 15043 | + |
| 15044 | + SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags); |
| 15045 | + DCI.AddToWorklist(Inner.getNode()); |
| 15046 | + return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags); |
| 15047 | + } |
| 15048 | + |
| 15049 | + // If one of Y and Z is constant, they have been handled above. If both were |
| 15050 | + // constant, the addition would have been folded in SelectionDAG::getNode |
| 15051 | + // already. This ensures that the generic DAG combines won't undo the |
| 15052 | + // following reassociation. |
| 15053 | + assert(!YIsConstant && !ZIsConstant); |
| 15054 | + |
| 15055 | + if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) { |
| 15056 | + // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and |
| 15057 | + // y are uniform and z isn't. |
| 15058 | + // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and |
| 15059 | + // z are uniform and y isn't. |
| 15060 | + // The goal is to push uniform operands up in the computation, so that they |
| 15061 | + // can be handled with scalar operations. We can't use reassociateScalarOps |
| 15062 | + // for this since it requires two identical commutative operations to |
| 15063 | + // reassociate. |
| 15064 | + if (Y->isDivergent()) |
| 15065 | + std::swap(Y, Z); |
| 15066 | + SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags); |
| 15067 | + DCI.AddToWorklist(UniformInner.getNode()); |
| 15068 | + return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags); |
| 15069 | + } |
| 15070 | + |
14990 | 15071 | return SDValue();
|
14991 | 15072 | }
|
14992 | 15073 |
|
|
0 commit comments