Skip to content

Commit 32fa47d

Browse files
committed
[NVPTX] Add f32x2 instructions and register class
Also update some test cases to use the autogenerator.
1 parent fe3933d commit 32fa47d

22 files changed

+4411
-1351
lines changed

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -471,8 +471,17 @@ bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
471471
// We only care about packed two-element vectors (16x2 and 32x2) as they
472472
// are the only real vector types we need to deal with.
473473
MVT VT = Vector.getSimpleValueType();
474-
if (!Isv2x16VT(VT))
474+
if (!isPackedVectorTy(VT) || VT.getVectorNumElements() != 2)
475475
return false;
476+
477+
unsigned Opcode;
478+
if (VT.is32BitVector())
479+
Opcode = NVPTX::I32toV2I16;
480+
else if (VT.is64BitVector())
481+
Opcode = NVPTX::I64toV2I32;
482+
else
483+
llvm_unreachable("Unhandled packed type");
484+
476485
// Find and record all uses of this vector that extract element 0 or 1.
477486
SmallVector<SDNode *, 4> E0, E1;
478487
for (auto *U : Vector.getNode()->users()) {
@@ -496,11 +505,11 @@ bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
496505
if (E0.empty() || E1.empty())
497506
return false;
498507

499-
// Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
500-
// into f16,f16 SplitF16x2(V)
508+
// Merge (EltTy extractelt(V, 0), EltTy extractelt(V,1))
509+
// into EltTy,EltTy Split[EltTy]x2(V)
501510
MVT EltVT = VT.getVectorElementType();
502511
SDNode *ScatterOp =
503-
CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector);
512+
CurDAG->getMachineNode(Opcode, SDLoc(N), EltVT, EltVT, Vector);
504513
for (auto *Node : E0)
505514
ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
506515
for (auto *Node : E1)
@@ -1035,6 +1044,7 @@ pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i8,
10351044
case MVT::i32:
10361045
case MVT::f32:
10371046
return Opcode_i32;
1047+
case MVT::v2f32:
10381048
case MVT::i64:
10391049
case MVT::f64:
10401050
return Opcode_i64;

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 95 additions & 61 deletions
Large diffs are not rendered by default.

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
151151
def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
152152
def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
153153
def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">;
154+
def hasF32x2Instructions : Predicate<"Subtarget->hasF32x2Instructions()">;
154155

155156
def True : Predicate<"true">;
156157

@@ -185,6 +186,7 @@ class ValueToRegClass<ValueType T> {
185186
!eq(name, "bf16"): Int16Regs,
186187
!eq(name, "v2bf16"): Int32Regs,
187188
!eq(name, "f32"): Float32Regs,
189+
!eq(name, "v2f32"): Int64Regs,
188190
!eq(name, "f64"): Float64Regs,
189191
!eq(name, "ai32"): Int32ArgRegs,
190192
!eq(name, "ai64"): Int64ArgRegs,
@@ -231,6 +233,7 @@ def BF16RT : RegTyInfo<bf16, Int16Regs, bf16imm, fpimm, supports_imm = 0>;
231233

232234
def F16X2RT : RegTyInfo<v2f16, Int32Regs, ?, ?, supports_imm = 0>;
233235
def BF16X2RT : RegTyInfo<v2bf16, Int32Regs, ?, ?, supports_imm = 0>;
236+
def F32X2RT : RegTyInfo<v2f32, Int64Regs, ?, ?, supports_imm = 0>;
234237

235238

236239
// This class provides a basic wrapper around an NVPTXInst that abstracts the
@@ -462,6 +465,18 @@ multiclass F3<string op_str, SDPatternOperator op_pat> {
462465
[(set f16:$dst, (op_pat f16:$a, f16:$b))]>,
463466
Requires<[useFP16Math]>;
464467

468+
def f32x2rr_ftz :
469+
BasicNVPTXInst<(outs Int64Regs:$dst),
470+
(ins Int64Regs:$a, Int64Regs:$b),
471+
op_str # ".ftz.f32x2",
472+
[(set v2f32:$dst, (op_pat v2f32:$a, v2f32:$b))]>,
473+
Requires<[hasF32x2Instructions, doF32FTZ]>;
474+
def f32x2rr :
475+
BasicNVPTXInst<(outs Int64Regs:$dst),
476+
(ins Int64Regs:$a, Int64Regs:$b),
477+
op_str # ".f32x2",
478+
[(set v2f32:$dst, (op_pat v2f32:$a, v2f32:$b))]>,
479+
Requires<[hasF32x2Instructions]>;
465480
def f16x2rr_ftz :
466481
BasicNVPTXInst<(outs Int32Regs:$dst),
467482
(ins Int32Regs:$a, Int32Regs:$b),
@@ -840,6 +855,9 @@ def : Pat<(vt (select i1:$p, vt:$a, vt:$b)),
840855
(SELP_b32rr $a, $b, $p)>;
841856
}
842857

858+
def : Pat<(v2f32 (select i1:$p, v2f32:$a, v2f32:$b)),
859+
(SELP_b64rr $a, $b, $p)>;
860+
843861
//-----------------------------------
844862
// Test Instructions
845863
//-----------------------------------
@@ -1368,6 +1386,8 @@ defm BFMA16 : FMA<"fma.rn.bf16", BF16RT, [hasBF16Math]>;
13681386
defm BFMA16x2 : FMA<"fma.rn.bf16x2", BF16X2RT, [hasBF16Math]>;
13691387
defm FMA32_ftz : FMA<"fma.rn.ftz.f32", F32RT, [doF32FTZ]>;
13701388
defm FMA32 : FMA<"fma.rn.f32", F32RT>;
1389+
defm FMA32x2_ftz : FMA<"fma.rn.ftz.f32x2", F32X2RT, [hasF32x2Instructions, doF32FTZ]>;
1390+
defm FMA32x2 : FMA<"fma.rn.f32x2", F32X2RT, [hasF32x2Instructions]>;
13711391
defm FMA64 : FMA<"fma.rn.f64", F64RT>;
13721392

13731393
// sin/cos
@@ -2714,6 +2734,7 @@ def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))), (I64toI32H $s)>;
27142734
def: Pat<(i32 (sext (extractelt v2i16:$src, 0))),
27152735
(CVT_INREG_s32_s16 $src)>;
27162736

2737+
// Handle extracting one element from the pair (32-bit types)
27172738
foreach vt = [v2f16, v2bf16, v2i16] in {
27182739
def : Pat<(extractelt vt:$src, 0), (I32toI16L_Sink $src)>, Requires<[hasPTX<71>]>;
27192740
def : Pat<(extractelt vt:$src, 1), (I32toI16H_Sink $src)>, Requires<[hasPTX<71>]>;
@@ -2725,10 +2746,21 @@ foreach vt = [v2f16, v2bf16, v2i16] in {
27252746
(V2I16toI32 $a, $b)>;
27262747
}
27272748

2749+
// Same thing for the 64-bit type v2f32.
2750+
foreach vt = [v2f32] in {
2751+
def : Pat<(extractelt vt:$src, 0), (I64toI32L_Sink $src)>, Requires<[hasPTX<71>]>;
2752+
def : Pat<(extractelt vt:$src, 1), (I64toI32H_Sink $src)>, Requires<[hasPTX<71>]>;
2753+
2754+
def : Pat<(extractelt vt:$src, 0), (I64toI32L $src)>;
2755+
def : Pat<(extractelt vt:$src, 1), (I64toI32H $src)>;
2756+
2757+
def : Pat<(vt (build_vector vt.ElementType:$a, vt.ElementType:$b)),
2758+
(V2I32toI64 $a, $b)>;
2759+
}
2760+
27282761
def: Pat<(v2i16 (scalar_to_vector i16:$a)),
27292762
(CVT_u32_u16 $a, CvtNONE)>;
27302763

2731-
27322764
def nvptx_build_vector : SDNode<"NVPTXISD::BUILD_VECTOR", SDTypeProfile<1, 2, []>, []>;
27332765

27342766
def : Pat<(i64 (nvptx_build_vector i32:$a, i32:$b)),

llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,9 @@ def Int16Regs : NVPTXRegClass<[i16, f16, bf16], 16, (add (sequence "RS%u", 0, 4)
6060
def Int32Regs : NVPTXRegClass<[i32, v2f16, v2bf16, v2i16, v4i8, f32], 32,
6161
(add (sequence "R%u", 0, 4),
6262
VRFrame32, VRFrameLocal32)>;
63-
def Int64Regs : NVPTXRegClass<[i64, f64], 64, (add (sequence "RL%u", 0, 4), VRFrame64, VRFrameLocal64)>;
63+
def Int64Regs : NVPTXRegClass<[i64, v2f32, f64], 64,
64+
(add (sequence "RL%u", 0, 4),
65+
VRFrame64, VRFrameLocal64)>;
6466
// 128-bit regs are not defined as general regs in NVPTX. They are used for inlineASM only.
6567
def Int128Regs : NVPTXRegClass<[i128], 128, (add (sequence "RQ%u", 0, 4))>;
6668

llvm/lib/Target/NVPTX/NVPTXSubtarget.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
116116

117117
return HasTcgen05 && PTXVersion >= 86;
118118
}
119+
// f32x2 instructions are available in the Blackwell family (requires SmVersion >= 100 and PTXVersion >= 86).
120+
bool hasF32x2Instructions() const {
121+
return SmVersion >= 100 && PTXVersion >= 86;
122+
}
119123

120124
// TMA G2S copy with cta_group::1/2 support
121125
bool hasCpAsyncBulkTensorCTAGroupSupport() const {

llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,8 +129,9 @@ class NVPTXTTIImpl final : public BasicTTIImplBase<NVPTXTTIImpl> {
129129
Insert = false;
130130
}
131131
}
132-
if (Insert && Isv2x16VT(VT)) {
133-
// Can be built in a single mov
132+
if (Insert && isPackedVectorTy(VT) && VT.is32BitVector()) {
133+
// Can be built in a single 32-bit mov (64-bit regs are emulated in SASS
134+
// with 2x 32-bit regs)
134135
Cost += 1;
135136
Insert = false;
136137
}

llvm/lib/Target/NVPTX/NVPTXUtilities.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,14 @@ inline unsigned promoteScalarArgumentSize(unsigned size) {
8585

8686
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM);
8787

88-
inline bool Isv2x16VT(EVT VT) {
89-
return (VT == MVT::v2f16 || VT == MVT::v2bf16 || VT == MVT::v2i16);
88+
inline bool isPackedVectorTy(EVT VT) {
89+
return (VT == MVT::v4i8 || VT == MVT::v2f16 || VT == MVT::v2bf16 ||
90+
VT == MVT::v2i16 || VT == MVT::v2f32);
91+
}
92+
93+
inline bool isPackedElementTy(EVT VT) {
94+
return (VT == MVT::i8 || VT == MVT::f16 || VT == MVT::bf16 ||
95+
VT == MVT::i16 || VT == MVT::f32);
9096
}
9197

9298
inline bool shouldPassAsArray(Type *Ty) {

llvm/test/CodeGen/NVPTX/aggregate-return.ll

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ define void @test_v2f32(<2 x float> %input, ptr %output) {
1010
; CHECK-LABEL: @test_v2f32
1111
%call = tail call <2 x float> @barv(<2 x float> %input)
1212
; CHECK: .param .align 8 .b8 retval0[8];
13-
; CHECK: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [retval0];
13+
; CHECK: ld.param.b64 [[E0_1:%rd[0-9]+]], [retval0];
14+
; CHECK: mov.b64 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [[E0_1]]
1415
store <2 x float> %call, ptr %output, align 8
1516
; CHECK: st.v2.b32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]}
1617
ret void
@@ -27,9 +28,7 @@ define void @test_v3f32(<3 x float> %input, ptr %output) {
2728
; CHECK-NOT: ld.param.b32 [[E3:%r[0-9]+]], [retval0+12];
2829
store <3 x float> %call, ptr %output, align 8
2930
; CHECK-DAG: st.b32 [{{%rd[0-9]}}+8],
30-
; -- This is suboptimal. We should do st.v2.f32 instead
31-
; of combining 2xf32 into i64.
32-
; CHECK-DAG: st.b64 [{{%rd[0-9]}}],
31+
; CHECK-DAG: st.v2.b32 [{{%rd[0-9]}}],
3332
; CHECK: ret;
3433
ret void
3534
}

llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Lines changed: 76 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -707,108 +707,124 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
707707
; SM70: {
708708
; SM70-NEXT: .reg .b16 %rs<9>;
709709
; SM70-NEXT: .reg .b32 %r<21>;
710-
; SM70-NEXT: .reg .b64 %rd<2>;
710+
; SM70-NEXT: .reg .b64 %rd<6>;
711711
; SM70-EMPTY:
712712
; SM70-NEXT: // %bb.0:
713713
; SM70-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
714714
; SM70-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
715-
; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r1;
716-
; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
717-
; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r3;
718-
; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r4;
719-
; SM70-NEXT: cvt.u32.u16 %r5, %rs8;
715+
; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4;
716+
; SM70-NEXT: cvt.u32.u16 %r5, %rs2;
720717
; SM70-NEXT: shl.b32 %r6, %r5, 16;
721-
; SM70-NEXT: cvt.u32.u16 %r7, %rs7;
718+
; SM70-NEXT: cvt.u32.u16 %r7, %rs1;
722719
; SM70-NEXT: shl.b32 %r8, %r7, 16;
723-
; SM70-NEXT: cvt.u32.u16 %r9, %rs6;
720+
; SM70-NEXT: mov.b64 %rd2, {%r8, %r6};
721+
; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r3;
722+
; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
724723
; SM70-NEXT: shl.b32 %r10, %r9, 16;
725-
; SM70-NEXT: cvt.u32.u16 %r11, %rs5;
724+
; SM70-NEXT: cvt.u32.u16 %r11, %rs3;
726725
; SM70-NEXT: shl.b32 %r12, %r11, 16;
727-
; SM70-NEXT: cvt.u32.u16 %r13, %rs4;
726+
; SM70-NEXT: mov.b64 %rd3, {%r12, %r10};
727+
; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r2;
728+
; SM70-NEXT: cvt.u32.u16 %r13, %rs6;
728729
; SM70-NEXT: shl.b32 %r14, %r13, 16;
729-
; SM70-NEXT: cvt.u32.u16 %r15, %rs3;
730+
; SM70-NEXT: cvt.u32.u16 %r15, %rs5;
730731
; SM70-NEXT: shl.b32 %r16, %r15, 16;
731-
; SM70-NEXT: cvt.u32.u16 %r17, %rs2;
732+
; SM70-NEXT: mov.b64 %rd4, {%r16, %r14};
733+
; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1;
734+
; SM70-NEXT: cvt.u32.u16 %r17, %rs8;
732735
; SM70-NEXT: shl.b32 %r18, %r17, 16;
733-
; SM70-NEXT: cvt.u32.u16 %r19, %rs1;
736+
; SM70-NEXT: cvt.u32.u16 %r19, %rs7;
734737
; SM70-NEXT: shl.b32 %r20, %r19, 16;
735-
; SM70-NEXT: st.param.v4.b32 [func_retval0], {%r20, %r18, %r16, %r14};
736-
; SM70-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r10, %r8, %r6};
738+
; SM70-NEXT: mov.b64 %rd5, {%r20, %r18};
739+
; SM70-NEXT: st.param.v2.b64 [func_retval0], {%rd5, %rd4};
740+
; SM70-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd2};
737741
; SM70-NEXT: ret;
738742
;
739743
; SM80-LABEL: test_extload_bf16x8(
740744
; SM80: {
741745
; SM80-NEXT: .reg .b16 %rs<9>;
742746
; SM80-NEXT: .reg .b32 %r<13>;
743-
; SM80-NEXT: .reg .b64 %rd<2>;
747+
; SM80-NEXT: .reg .b64 %rd<6>;
744748
; SM80-EMPTY:
745749
; SM80-NEXT: // %bb.0:
746750
; SM80-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
747751
; SM80-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
748-
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r1;
749-
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2;
750-
; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r3;
751-
; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r4;
752-
; SM80-NEXT: cvt.f32.bf16 %r5, %rs8;
753-
; SM80-NEXT: cvt.f32.bf16 %r6, %rs7;
754-
; SM80-NEXT: cvt.f32.bf16 %r7, %rs6;
755-
; SM80-NEXT: cvt.f32.bf16 %r8, %rs5;
756-
; SM80-NEXT: cvt.f32.bf16 %r9, %rs4;
757-
; SM80-NEXT: cvt.f32.bf16 %r10, %rs3;
758-
; SM80-NEXT: cvt.f32.bf16 %r11, %rs2;
759-
; SM80-NEXT: cvt.f32.bf16 %r12, %rs1;
760-
; SM80-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9};
761-
; SM80-NEXT: st.param.v4.b32 [func_retval0+16], {%r8, %r7, %r6, %r5};
752+
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4;
753+
; SM80-NEXT: cvt.f32.bf16 %r5, %rs2;
754+
; SM80-NEXT: cvt.f32.bf16 %r6, %rs1;
755+
; SM80-NEXT: mov.b64 %rd2, {%r6, %r5};
756+
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r3;
757+
; SM80-NEXT: cvt.f32.bf16 %r7, %rs4;
758+
; SM80-NEXT: cvt.f32.bf16 %r8, %rs3;
759+
; SM80-NEXT: mov.b64 %rd3, {%r8, %r7};
760+
; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r2;
761+
; SM80-NEXT: cvt.f32.bf16 %r9, %rs6;
762+
; SM80-NEXT: cvt.f32.bf16 %r10, %rs5;
763+
; SM80-NEXT: mov.b64 %rd4, {%r10, %r9};
764+
; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r1;
765+
; SM80-NEXT: cvt.f32.bf16 %r11, %rs8;
766+
; SM80-NEXT: cvt.f32.bf16 %r12, %rs7;
767+
; SM80-NEXT: mov.b64 %rd5, {%r12, %r11};
768+
; SM80-NEXT: st.param.v2.b64 [func_retval0], {%rd5, %rd4};
769+
; SM80-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd2};
762770
; SM80-NEXT: ret;
763771
;
764772
; SM80-FTZ-LABEL: test_extload_bf16x8(
765773
; SM80-FTZ: {
766774
; SM80-FTZ-NEXT: .reg .b16 %rs<9>;
767775
; SM80-FTZ-NEXT: .reg .b32 %r<13>;
768-
; SM80-FTZ-NEXT: .reg .b64 %rd<2>;
776+
; SM80-FTZ-NEXT: .reg .b64 %rd<6>;
769777
; SM80-FTZ-EMPTY:
770778
; SM80-FTZ-NEXT: // %bb.0:
771779
; SM80-FTZ-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
772780
; SM80-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
773-
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r1;
774-
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r2;
775-
; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r3;
776-
; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r4;
777-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8;
778-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7;
779-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6;
780-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5;
781-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4;
782-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3;
783-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2;
784-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1;
785-
; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9};
786-
; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r8, %r7, %r6, %r5};
781+
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r4;
782+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
783+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs1;
784+
; SM80-FTZ-NEXT: mov.b64 %rd2, {%r6, %r5};
785+
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r3;
786+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs4;
787+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs3;
788+
; SM80-FTZ-NEXT: mov.b64 %rd3, {%r8, %r7};
789+
; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r2;
790+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs6;
791+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs5;
792+
; SM80-FTZ-NEXT: mov.b64 %rd4, {%r10, %r9};
793+
; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r1;
794+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs8;
795+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs7;
796+
; SM80-FTZ-NEXT: mov.b64 %rd5, {%r12, %r11};
797+
; SM80-FTZ-NEXT: st.param.v2.b64 [func_retval0], {%rd5, %rd4};
798+
; SM80-FTZ-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd2};
787799
; SM80-FTZ-NEXT: ret;
788800
;
789801
; SM90-LABEL: test_extload_bf16x8(
790802
; SM90: {
791803
; SM90-NEXT: .reg .b16 %rs<9>;
792804
; SM90-NEXT: .reg .b32 %r<13>;
793-
; SM90-NEXT: .reg .b64 %rd<2>;
805+
; SM90-NEXT: .reg .b64 %rd<6>;
794806
; SM90-EMPTY:
795807
; SM90-NEXT: // %bb.0:
796808
; SM90-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
797809
; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
798-
; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r1;
799-
; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r2;
800-
; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r3;
801-
; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r4;
802-
; SM90-NEXT: cvt.f32.bf16 %r5, %rs8;
803-
; SM90-NEXT: cvt.f32.bf16 %r6, %rs7;
804-
; SM90-NEXT: cvt.f32.bf16 %r7, %rs6;
805-
; SM90-NEXT: cvt.f32.bf16 %r8, %rs5;
806-
; SM90-NEXT: cvt.f32.bf16 %r9, %rs4;
807-
; SM90-NEXT: cvt.f32.bf16 %r10, %rs3;
808-
; SM90-NEXT: cvt.f32.bf16 %r11, %rs2;
809-
; SM90-NEXT: cvt.f32.bf16 %r12, %rs1;
810-
; SM90-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9};
811-
; SM90-NEXT: st.param.v4.b32 [func_retval0+16], {%r8, %r7, %r6, %r5};
810+
; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r4;
811+
; SM90-NEXT: cvt.f32.bf16 %r5, %rs2;
812+
; SM90-NEXT: cvt.f32.bf16 %r6, %rs1;
813+
; SM90-NEXT: mov.b64 %rd2, {%r6, %r5};
814+
; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r3;
815+
; SM90-NEXT: cvt.f32.bf16 %r7, %rs4;
816+
; SM90-NEXT: cvt.f32.bf16 %r8, %rs3;
817+
; SM90-NEXT: mov.b64 %rd3, {%r8, %r7};
818+
; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r2;
819+
; SM90-NEXT: cvt.f32.bf16 %r9, %rs6;
820+
; SM90-NEXT: cvt.f32.bf16 %r10, %rs5;
821+
; SM90-NEXT: mov.b64 %rd4, {%r10, %r9};
822+
; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r1;
823+
; SM90-NEXT: cvt.f32.bf16 %r11, %rs8;
824+
; SM90-NEXT: cvt.f32.bf16 %r12, %rs7;
825+
; SM90-NEXT: mov.b64 %rd5, {%r12, %r11};
826+
; SM90-NEXT: st.param.v2.b64 [func_retval0], {%rd5, %rd4};
827+
; SM90-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd2};
812828
; SM90-NEXT: ret;
813829
%load = load <8 x bfloat>, ptr addrspace(3) %arg, align 16
814830
%res = fpext <8 x bfloat> %load to <8 x float>

0 commit comments

Comments
 (0)