Skip to content

[AMDGPU][SDAG] Handle ISD::PTRADD in VOP3 patterns #143881

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: users/ritter-x2a/06-12-_amdgpu_sdag_test_isd_ptradd_handling_in_vop3_patterns
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 26 additions & 10 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -484,12 +484,13 @@ let OtherPredicates = [isGFX10Plus, Has16BitInsts], True16Predicate = NotHasTrue
defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_e64>;
} // End OtherPredicates = [isGFX10Plus, Has16BitInsts], True16Predicate = NotHasTrue16BitInsts

class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2, bit op1IsRight = 0> : PatFrag<
(ops node:$x, node:$y, node:$z),
// When the inner operation is used multiple times, selecting 3-op
// instructions may still be beneficial -- if the other users can be
// combined similarly. Let's be conservative for now.
(op2 (HasOneUseBinOp<op1> node:$x, node:$y), node:$z),
!if(op1IsRight, (op2 node:$z, (HasOneUseBinOp<op1> node:$x, node:$y)),
(op2 (HasOneUseBinOp<op1> node:$x, node:$y), node:$z)),
[{
// Only use VALU ops when the result is divergent.
if (!N->isDivergent())
Expand All @@ -516,7 +517,10 @@ class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
let PredicateCodeUsesOperands = 1;
}

class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : ThreeOpFragSDAG<op1, op2> {
// Matches (op2 (op1 x, y), z) if op1IsRight = 0 and
// matches (op2 z, (op1 x, y)) if op1IsRight = 1.
class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2,
bit op1IsRight = 0> : ThreeOpFragSDAG<op1, op2, op1IsRight> {
// The divergence predicate is irrelevant in GlobalISel, as we have
// proper register bank checks. We just need to verify the constant
// bus restriction when all the sources are considered.
Expand Down Expand Up @@ -806,12 +810,19 @@ def : GCNPat<
(DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
(V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;

let SubtargetPredicate = isGFX940Plus in
// Selection patterns that fold a shift-left and a 64-bit addition into a single
// V_LSHL_ADD_U64. Both patterns are only enabled under isGFX940Plus.
let SubtargetPredicate = isGFX940Plus in {
def : GCNPat<
// (add (shl x, y), z) -> ((x << y) + z)
// NOTE(review): shl_0_to_4 presumably limits the shift amount to the range
// 0..4 -- confirm against its definition.
(ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2),
(V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)
>;

def : GCNPat <
// (ptradd z, (shl x, y)) -> ((x << y) + z)
// op1IsRight = 1 makes the fragment match the shift as the second operand of
// the ptradd; the selected instruction and its operand order are the same as
// in the add pattern.
(ThreeOpFrag<shl_0_to_4, ptradd, /*op1IsRight=*/1> i64:$src0, i32:$src1, i64:$src2),
(V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)
>;
} // End SubtargetPredicate = isGFX940Plus

def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>;
def : VOPBinOpClampPat<ssubsat, V_SUB_I32_e64, i32>;

Expand Down Expand Up @@ -880,19 +891,24 @@ multiclass IMAD32_Pats <VOP3_Pseudo inst> {

// Handle cases where amdgpu-codegenprepare-mul24 made a mul24 instead of a normal mul.
// We need to separate this because otherwise OtherPredicates would be overridden.
class IMAD32_Mul24_Pat<VOP3_Pseudo inst>: GCNPat <
(i64 (add (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)),
(inst $src0, $src1, $src2, 0 /* clamp */)
>;
// Selects a 64-bit AddOp whose one operand is a 64-bit AMDGPUmul_u24 of two
// 32-bit sources into the multiply-add instruction `inst`.
//   inst       - the VOP3 pseudo to emit; it receives $src0, $src1, $src2 and
//                a clamp operand of 0.
//   AddOp      - the addition-like operator to match (e.g. add or ptradd).
//   mulIsRight - position of the multiply among AddOp's operands:
//                  0: (AddOp (mul24 src0, src1), src2)
//                  1: (AddOp src2, (mul24 src0, src1))
class IMAD32_Mul24_Pats_Impl<VOP3_Pseudo inst, SDPatternOperator AddOp, bit mulIsRight = 0> : GCNPat <
!if(mulIsRight, (i64 (AddOp i64:$src2, (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)))),
(i64 (AddOp (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2))),
(inst $src0, $src1, $src2, 0 /* clamp */)>;

// Instantiates the mul24-to-MAD patterns for `inst`:
//   - add with the multiply matched as the first operand, and
//   - ptradd with the multiply matched as the second operand.
// NOTE(review): presumably ptradd only needs the "multiply on the right" form
// because its first operand is the base pointer and its second the offset --
// confirm against the ISD::PTRADD definition.
multiclass IMAD32_Mul24_Pats<VOP3_Pseudo inst> {
def : IMAD32_Mul24_Pats_Impl<inst, add>;
def : IMAD32_Mul24_Pats_Impl<inst, ptradd, /*mulIsRight=*/1>;
}

// exclude pre-GFX9 where it was slow
let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in {
defm : IMAD32_Pats<V_MAD_U64_U32_e64>;
def : IMAD32_Mul24_Pat<V_MAD_U64_U32_e64>;
defm : IMAD32_Mul24_Pats<V_MAD_U64_U32_e64>;
}
let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in {
defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>;
def : IMAD32_Mul24_Pat<V_MAD_U64_U32_gfx11_e64>;
defm : IMAD32_Mul24_Pats<V_MAD_U64_U32_gfx11_e64>;
}

def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {
Expand Down
41 changes: 12 additions & 29 deletions llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
Original file line number Diff line number Diff line change
Expand Up @@ -266,42 +266,25 @@ define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) {

; Use non-zero shift amounts in v_lshl_add_u64.
define ptr @select_v_lshl_add_u64(ptr %base, i64 %voffset) {
; GFX942_PTRADD-LABEL: select_v_lshl_add_u64:
; GFX942_PTRADD: ; %bb.0:
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], 3, v[2:3]
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
;
; GFX942_LEGACY-LABEL: select_v_lshl_add_u64:
; GFX942_LEGACY: ; %bb.0:
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 3, v[0:1]
; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
; GFX942-LABEL: select_v_lshl_add_u64:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 3, v[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i64, ptr %base, i64 %voffset
ret ptr %gep
}

; Fold mul and add into v_mad, even if amdgpu-codegenprepare-mul24 turned the
; mul into a mul24.
define ptr @fold_mul24_into_mad(ptr %base, i64 %a, i64 %b) {
; GFX942_PTRADD-LABEL: fold_mul24_into_mad:
; GFX942_PTRADD: ; %bb.0:
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_PTRADD-NEXT: v_and_b32_e32 v2, 0xfffff, v2
; GFX942_PTRADD-NEXT: v_and_b32_e32 v4, 0xfffff, v4
; GFX942_PTRADD-NEXT: v_mul_hi_u32_u24_e32 v3, v2, v4
; GFX942_PTRADD-NEXT: v_mul_u32_u24_e32 v2, v2, v4
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
;
; GFX942_LEGACY-LABEL: fold_mul24_into_mad:
; GFX942_LEGACY: ; %bb.0:
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_LEGACY-NEXT: v_and_b32_e32 v2, 0xfffff, v2
; GFX942_LEGACY-NEXT: v_and_b32_e32 v3, 0xfffff, v4
; GFX942_LEGACY-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v3, v[0:1]
; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
; GFX942-LABEL: fold_mul24_into_mad:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_and_b32_e32 v2, 0xfffff, v2
; GFX942-NEXT: v_and_b32_e32 v3, 0xfffff, v4
; GFX942-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v3, v[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
%a_masked = and i64 %a, u0xfffff
%b_masked = and i64 %b, u0xfffff
%mul = mul i64 %a_masked, %b_masked
Expand Down
42 changes: 14 additions & 28 deletions llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,12 @@ define ptr @gep_as0(ptr %p, i64 %offset) {
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX942_PTRADD-LABEL: gep_as0:
; GFX942_PTRADD: ; %bb.0: ; %entry
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
;
; GFX942_LEGACY-LABEL: gep_as0:
; GFX942_LEGACY: ; %bb.0: ; %entry
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5
; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
; GFX942-LABEL: gep_as0:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: gep_as0:
; GFX10: ; %bb.0: ; %entry
Expand Down Expand Up @@ -187,20 +179,12 @@ define ptr @multi_gep_as0(ptr %p, i64 %offset) {
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX942_PTRADD-LABEL: multi_gep_as0:
; GFX942_PTRADD: ; %bb.0: ; %entry
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
;
; GFX942_LEGACY-LABEL: multi_gep_as0:
; GFX942_LEGACY: ; %bb.0: ; %entry
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5
; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
; GFX942-LABEL: multi_gep_as0:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: multi_gep_as0:
; GFX10: ; %bb.0: ; %entry
Expand Down Expand Up @@ -535,3 +519,5 @@ entry:
; GFX12_PTRADD: {{.*}}
; GFX8_LEGACY: {{.*}}
; GFX8_PTRADD: {{.*}}
; GFX942_LEGACY: {{.*}}
; GFX942_PTRADD: {{.*}}