|
33 | 33 | #include "llvm/CodeGen/MachineFrameInfo.h"
|
34 | 34 | #include "llvm/CodeGen/MachineFunction.h"
|
35 | 35 | #include "llvm/CodeGen/MachineLoopInfo.h"
|
| 36 | +#include "llvm/CodeGen/SDPatternMatch.h" |
36 | 37 | #include "llvm/IR/DiagnosticInfo.h"
|
37 | 38 | #include "llvm/IR/IRBuilder.h"
|
38 | 39 | #include "llvm/IR/IntrinsicInst.h"
|
|
46 | 47 | #include <optional>
|
47 | 48 |
|
48 | 49 | using namespace llvm;
|
| 50 | +using namespace llvm::SDPatternMatch; |
49 | 51 |
|
50 | 52 | #define DEBUG_TYPE "si-lower"
|
51 | 53 |
|
@@ -14320,7 +14322,7 @@ static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
|
14320 | 14322 | // instead of a tree.
|
14321 | 14323 | SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
|
14322 | 14324 | DAGCombinerInfo &DCI) const {
|
14323 |
| - assert(N->getOpcode() == ISD::ADD); |
| 14325 | + assert(N->isAnyAdd()); |
14324 | 14326 |
|
14325 | 14327 | SelectionDAG &DAG = DCI.DAG;
|
14326 | 14328 | EVT VT = N->getValueType(0);
|
@@ -14353,7 +14355,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
|
14353 | 14355 | for (SDNode *User : LHS->users()) {
|
14354 | 14356 | // There is a use that does not feed into addition, so the multiply can't
|
14355 | 14357 | // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
|
14356 |
| - if (User->getOpcode() != ISD::ADD) |
| 14358 | + if (!User->isAnyAdd()) |
14357 | 14359 | return SDValue();
|
14358 | 14360 |
|
14359 | 14361 | // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
|
@@ -14465,8 +14467,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
|
14465 | 14467 |
|
14466 | 14468 | SDValue Hi = getHiHalf64(LHS, DAG);
|
14467 | 14469 | SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
|
| 14470 | + unsigned Opcode = N->getOpcode(); |
| 14471 | + if (Opcode == ISD::PTRADD) |
| 14472 | + Opcode = ISD::ADD; |
14468 | 14473 | SDValue AddHi =
|
14469 |
| - DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags()); |
| 14474 | + DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags()); |
14470 | 14475 |
|
14471 | 14476 | SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
|
14472 | 14477 | return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
|
@@ -14940,44 +14945,120 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
|
14940 | 14945 | DAGCombinerInfo &DCI) const {
|
14941 | 14946 | SelectionDAG &DAG = DCI.DAG;
|
14942 | 14947 | SDLoc DL(N);
|
| 14948 | + EVT VT = N->getValueType(0); |
14943 | 14949 | SDValue N0 = N->getOperand(0);
|
14944 | 14950 | SDValue N1 = N->getOperand(1);
|
14945 | 14951 |
|
14946 |
| - if (N1.getOpcode() == ISD::ADD) { |
14947 |
| - // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, |
14948 |
| - // y is not, and (add y, z) is used only once. |
14949 |
| - // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, |
14950 |
| - // z is not, and (add y, z) is used only once. |
14951 |
| - // The goal is to move constant offsets to the outermost ptradd, to create |
14952 |
| - // more opportunities to fold offsets into memory instructions. |
14953 |
| - // Together with the generic combines in DAGCombiner.cpp, this also |
14954 |
| - // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). |
14955 |
| - // |
14956 |
| - // This transform is here instead of in the general DAGCombiner as it can |
14957 |
| - // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for |
14958 |
| - // AArch64's CPA. |
14959 |
| - SDValue X = N0; |
14960 |
| - SDValue Y = N1.getOperand(0); |
14961 |
| - SDValue Z = N1.getOperand(1); |
14962 |
| - bool N1OneUse = N1.hasOneUse(); |
14963 |
| - bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); |
14964 |
| - bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); |
14965 |
| - if ((ZIsConstant != YIsConstant) && N1OneUse) { |
14966 |
| - SDNodeFlags Flags; |
14967 |
| - // If both additions in the original were NUW, the new ones are as well. |
14968 |
| - if (N->getFlags().hasNoUnsignedWrap() && |
14969 |
| - N1->getFlags().hasNoUnsignedWrap()) |
14970 |
| - Flags |= SDNodeFlags::NoUnsignedWrap; |
14971 |
| - |
14972 |
| - if (YIsConstant) |
14973 |
| - std::swap(Y, Z); |
14974 |
| - |
14975 |
| - SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags); |
14976 |
| - DCI.AddToWorklist(Inner.getNode()); |
14977 |
| - return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags); |
| 14952 | + // The following folds transform PTRADDs into regular arithmetic in cases |
| 14953 | + // where the PTRADD wouldn't be folded as an immediate offset into memory |
| 14954 | + // instructions anyway. They are target-specific in that other targets might |
| 14955 | + // prefer to not lose information about the pointer arithmetic. |
| 14956 | + |
| 14957 | + // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)). |
| 14958 | + // Adapted from DAGCombiner::visitADDLikeCommutative. |
| 14959 | + SDValue V, K; |
| 14960 | + if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) { |
| 14961 | + SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K); |
| 14962 | + DCI.AddToWorklist(Inner.getNode()); |
| 14963 | + return DAG.getNode(ISD::SUB, DL, VT, N0, Inner); |
| 14964 | + } |
| 14965 | + |
| 14966 | + // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in |
| 14967 | + // performAddCombine. |
| 14968 | + if (N1.getOpcode() == ISD::MUL) { |
| 14969 | + if (Subtarget->hasMad64_32()) { |
| 14970 | + if (SDValue Folded = tryFoldToMad64_32(N, DCI)) |
| 14971 | + return Folded; |
| 14972 | + } |
| 14973 | + } |
| 14974 | + |
| 14975 | + // If the 32 low bits of the constant are all zero, there is nothing to fold |
| 14976 | + // into an immediate offset, so it's better to eliminate the unnecessary |
| 14977 | + // addition for the lower 32 bits than to preserve the PTRADD. |
| 14978 | + // Analogous to a fold in performAddCombine. |
| 14979 | + if (VT == MVT::i64) { |
| 14980 | + if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI)) |
| 14981 | + return Folded; |
| 14982 | + } |
| 14983 | + |
| 14984 | + if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) { |
| 14985 | + // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c), v) with |
| 14986 | + // global address GA and constant c, such that c can be folded into GA. |
| 14987 | + SDValue GAValue = N0.getOperand(0); |
| 14988 | + if (const GlobalAddressSDNode *GA = |
| 14989 | + dyn_cast<GlobalAddressSDNode>(GAValue)) { |
| 14990 | + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
| 14991 | + if (DCI.isBeforeLegalizeOps() && TLI.isOffsetFoldingLegal(GA)) { |
| 14992 | + SDNodeFlags Flags; |
| 14993 | + // If both additions in the original were NUW, reassociation preserves |
| 14994 | + // that. |
| 14995 | + if (N->getFlags().hasNoUnsignedWrap() && |
| 14996 | + N0->getFlags().hasNoUnsignedWrap()) |
| 14997 | + Flags |= SDNodeFlags::NoUnsignedWrap; |
| 14998 | + SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags); |
| 14999 | + DCI.AddToWorklist(Inner.getNode()); |
| 15000 | + return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags); |
| 15001 | + } |
14978 | 15002 | }
|
14979 | 15003 | }
|
14980 | 15004 |
|
| 15005 | + if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse()) |
| 15006 | + return SDValue(); |
| 15007 | + |
| 15008 | + // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, |
| 15009 | + // y is not, and (add y, z) is used only once. |
| 15010 | + // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, |
| 15011 | + // z is not, and (add y, z) is used only once. |
| 15012 | + // The goal is to move constant offsets to the outermost ptradd, to create |
| 15013 | + // more opportunities to fold offsets into memory instructions. |
| 15014 | + // Together with the generic combines in DAGCombiner.cpp, this also |
| 15015 | + // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y). |
| 15016 | + // |
| 15017 | + // This transform is here instead of in the general DAGCombiner as it can |
| 15018 | + // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for |
| 15019 | + // AArch64's CPA. |
| 15020 | + SDValue X = N0; |
| 15021 | + SDValue Y = N1.getOperand(0); |
| 15022 | + SDValue Z = N1.getOperand(1); |
| 15023 | + bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); |
| 15024 | + bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); |
| 15025 | + |
| 15026 | + SDNodeFlags ReassocFlags; |
| 15027 | + // If both additions in the original were NUW, reassociation preserves that. |
| 15028 | + if (N->getFlags().hasNoUnsignedWrap() && N1->getFlags().hasNoUnsignedWrap()) |
| 15029 | + ReassocFlags |= SDNodeFlags::NoUnsignedWrap; |
| 15030 | + if (ZIsConstant != YIsConstant) { |
| 15031 | + |
| 15032 | + if (YIsConstant) |
| 15033 | + std::swap(Y, Z); |
| 15034 | + |
| 15035 | + SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags); |
| 15036 | + DCI.AddToWorklist(Inner.getNode()); |
| 15037 | + return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags); |
| 15038 | + } |
| 15039 | + |
| 15040 | + // If exactly one of Y and Z is constant, it was handled above. If both were |
| 15041 | + // constant, the addition would have been folded in SelectionDAG::getNode |
| 15042 | + // already. This ensures that the generic DAG combines won't undo the |
| 15043 | + // following reassociation. |
| 15044 | + assert(!YIsConstant && !ZIsConstant); |
| 15045 | + |
| 15046 | + if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) { |
| 15047 | + // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and |
| 15048 | + // y are uniform and z isn't. |
| 15049 | + // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and |
| 15050 | + // z are uniform and y isn't. |
| 15051 | + // The goal is to push uniform operands up in the computation, so that they |
| 15052 | + // can be handled with scalar operations. We can't use reassociateScalarOps |
| 15053 | + // for this since it requires two identical commutative operations to |
| 15054 | + // reassociate. |
| 15055 | + if (Y->isDivergent()) |
| 15056 | + std::swap(Y, Z); |
| 15057 | + SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags); |
| 15058 | + DCI.AddToWorklist(UniformInner.getNode()); |
| 15059 | + return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags); |
| 15060 | + } |
| 15061 | + |
14981 | 15062 | return SDValue();
|
14982 | 15063 | }
|
14983 | 15064 |
|
|
0 commit comments