@@ -707,108 +707,124 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
 ; SM70:       {
 ; SM70-NEXT:    .reg .b16 %rs<9>;
 ; SM70-NEXT:    .reg .b32 %r<21>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .b64 %rd<6>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
 ; SM70-NEXT:    ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
 ; SM70-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
-; SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
-; SM70-NEXT:    mov.b32 {%rs7, %rs8}, %r4;
-; SM70-NEXT:    cvt.u32.u16 %r5, %rs8;
+; SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
+; SM70-NEXT:    cvt.u32.u16 %r5, %rs2;
 ; SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; SM70-NEXT:    cvt.u32.u16 %r7, %rs7;
+; SM70-NEXT:    cvt.u32.u16 %r7, %rs1;
 ; SM70-NEXT:    shl.b32 %r8, %r7, 16;
-; SM70-NEXT:    cvt.u32.u16 %r9, %rs6;
+; SM70-NEXT:    mov.b64 %rd2, {%r8, %r6};
+; SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
+; SM70-NEXT:    cvt.u32.u16 %r9, %rs4;
 ; SM70-NEXT:    shl.b32 %r10, %r9, 16;
-; SM70-NEXT:    cvt.u32.u16 %r11, %rs5;
+; SM70-NEXT:    cvt.u32.u16 %r11, %rs3;
 ; SM70-NEXT:    shl.b32 %r12, %r11, 16;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs4;
+; SM70-NEXT:    mov.b64 %rd3, {%r12, %r10};
+; SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r2;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs6;
 ; SM70-NEXT:    shl.b32 %r14, %r13, 16;
-; SM70-NEXT:    cvt.u32.u16 %r15, %rs3;
+; SM70-NEXT:    cvt.u32.u16 %r15, %rs5;
 ; SM70-NEXT:    shl.b32 %r16, %r15, 16;
-; SM70-NEXT:    cvt.u32.u16 %r17, %rs2;
+; SM70-NEXT:    mov.b64 %rd4, {%r16, %r14};
+; SM70-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
+; SM70-NEXT:    cvt.u32.u16 %r17, %rs8;
 ; SM70-NEXT:    shl.b32 %r18, %r17, 16;
-; SM70-NEXT:    cvt.u32.u16 %r19, %rs1;
+; SM70-NEXT:    cvt.u32.u16 %r19, %rs7;
 ; SM70-NEXT:    shl.b32 %r20, %r19, 16;
-; SM70-NEXT:    st.param.v4.b32 [func_retval0], {%r20, %r18, %r16, %r14};
-; SM70-NEXT:    st.param.v4.b32 [func_retval0+16], {%r12, %r10, %r8, %r6};
+; SM70-NEXT:    mov.b64 %rd5, {%r20, %r18};
+; SM70-NEXT:    st.param.v2.b64 [func_retval0], {%rd5, %rd4};
+; SM70-NEXT:    st.param.v2.b64 [func_retval0+16], {%rd3, %rd2};
 ; SM70-NEXT:    ret;
 ;
 ; SM80-LABEL: test_extload_bf16x8(
 ; SM80:       {
 ; SM80-NEXT:    .reg .b16 %rs<9>;
 ; SM80-NEXT:    .reg .b32 %r<13>;
-; SM80-NEXT:    .reg .b64 %rd<2>;
+; SM80-NEXT:    .reg .b64 %rd<6>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
 ; SM80-NEXT:    ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
 ; SM80-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
-; SM80-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
-; SM80-NEXT:    mov.b32 {%rs7, %rs8}, %r4;
-; SM80-NEXT:    cvt.f32.bf16 %r5, %rs8;
-; SM80-NEXT:    cvt.f32.bf16 %r6, %rs7;
-; SM80-NEXT:    cvt.f32.bf16 %r7, %rs6;
-; SM80-NEXT:    cvt.f32.bf16 %r8, %rs5;
-; SM80-NEXT:    cvt.f32.bf16 %r9, %rs4;
-; SM80-NEXT:    cvt.f32.bf16 %r10, %rs3;
-; SM80-NEXT:    cvt.f32.bf16 %r11, %rs2;
-; SM80-NEXT:    cvt.f32.bf16 %r12, %rs1;
-; SM80-NEXT:    st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9};
-; SM80-NEXT:    st.param.v4.b32 [func_retval0+16], {%r8, %r7, %r6, %r5};
+; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
+; SM80-NEXT:    cvt.f32.bf16 %r5, %rs2;
+; SM80-NEXT:    cvt.f32.bf16 %r6, %rs1;
+; SM80-NEXT:    mov.b64 %rd2, {%r6, %r5};
+; SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
+; SM80-NEXT:    cvt.f32.bf16 %r7, %rs4;
+; SM80-NEXT:    cvt.f32.bf16 %r8, %rs3;
+; SM80-NEXT:    mov.b64 %rd3, {%r8, %r7};
+; SM80-NEXT:    mov.b32 {%rs5, %rs6}, %r2;
+; SM80-NEXT:    cvt.f32.bf16 %r9, %rs6;
+; SM80-NEXT:    cvt.f32.bf16 %r10, %rs5;
+; SM80-NEXT:    mov.b64 %rd4, {%r10, %r9};
+; SM80-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
+; SM80-NEXT:    cvt.f32.bf16 %r11, %rs8;
+; SM80-NEXT:    cvt.f32.bf16 %r12, %rs7;
+; SM80-NEXT:    mov.b64 %rd5, {%r12, %r11};
+; SM80-NEXT:    st.param.v2.b64 [func_retval0], {%rd5, %rd4};
+; SM80-NEXT:    st.param.v2.b64 [func_retval0+16], {%rd3, %rd2};
 ; SM80-NEXT:    ret;
 ;
 ; SM80-FTZ-LABEL: test_extload_bf16x8(
 ; SM80-FTZ:       {
 ; SM80-FTZ-NEXT:    .reg .b16 %rs<9>;
 ; SM80-FTZ-NEXT:    .reg .b32 %r<13>;
-; SM80-FTZ-NEXT:    .reg .b64 %rd<2>;
+; SM80-FTZ-NEXT:    .reg .b64 %rd<6>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
 ; SM80-FTZ-NEXT:    ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
 ; SM80-FTZ-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; SM80-FTZ-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; SM80-FTZ-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
-; SM80-FTZ-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
-; SM80-FTZ-NEXT:    mov.b32 {%rs7, %rs8}, %r4;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r5, %rs8;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r6, %rs7;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r7, %rs6;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r8, %rs5;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r9, %rs4;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r10, %rs3;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r11, %rs2;
-; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r12, %rs1;
-; SM80-FTZ-NEXT:    st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9};
-; SM80-FTZ-NEXT:    st.param.v4.b32 [func_retval0+16], {%r8, %r7, %r6, %r5};
+; SM80-FTZ-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r5, %rs2;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r6, %rs1;
+; SM80-FTZ-NEXT:    mov.b64 %rd2, {%r6, %r5};
+; SM80-FTZ-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r7, %rs4;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r8, %rs3;
+; SM80-FTZ-NEXT:    mov.b64 %rd3, {%r8, %r7};
+; SM80-FTZ-NEXT:    mov.b32 {%rs5, %rs6}, %r2;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r9, %rs6;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r10, %rs5;
+; SM80-FTZ-NEXT:    mov.b64 %rd4, {%r10, %r9};
+; SM80-FTZ-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r11, %rs8;
+; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %r12, %rs7;
+; SM80-FTZ-NEXT:    mov.b64 %rd5, {%r12, %r11};
+; SM80-FTZ-NEXT:    st.param.v2.b64 [func_retval0], {%rd5, %rd4};
+; SM80-FTZ-NEXT:    st.param.v2.b64 [func_retval0+16], {%rd3, %rd2};
 ; SM80-FTZ-NEXT:    ret;
 ;
 ; SM90-LABEL: test_extload_bf16x8(
 ; SM90:       {
 ; SM90-NEXT:    .reg .b16 %rs<9>;
 ; SM90-NEXT:    .reg .b32 %r<13>;
-; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-NEXT:    .reg .b64 %rd<6>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
 ; SM90-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; SM90-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; SM90-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
-; SM90-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
-; SM90-NEXT:    mov.b32 {%rs7, %rs8}, %r4;
-; SM90-NEXT:    cvt.f32.bf16 %r5, %rs8;
-; SM90-NEXT:    cvt.f32.bf16 %r6, %rs7;
-; SM90-NEXT:    cvt.f32.bf16 %r7, %rs6;
-; SM90-NEXT:    cvt.f32.bf16 %r8, %rs5;
-; SM90-NEXT:    cvt.f32.bf16 %r9, %rs4;
-; SM90-NEXT:    cvt.f32.bf16 %r10, %rs3;
-; SM90-NEXT:    cvt.f32.bf16 %r11, %rs2;
-; SM90-NEXT:    cvt.f32.bf16 %r12, %rs1;
-; SM90-NEXT:    st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9};
-; SM90-NEXT:    st.param.v4.b32 [func_retval0+16], {%r8, %r7, %r6, %r5};
+; SM90-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
+; SM90-NEXT:    cvt.f32.bf16 %r5, %rs2;
+; SM90-NEXT:    cvt.f32.bf16 %r6, %rs1;
+; SM90-NEXT:    mov.b64 %rd2, {%r6, %r5};
+; SM90-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
+; SM90-NEXT:    cvt.f32.bf16 %r7, %rs4;
+; SM90-NEXT:    cvt.f32.bf16 %r8, %rs3;
+; SM90-NEXT:    mov.b64 %rd3, {%r8, %r7};
+; SM90-NEXT:    mov.b32 {%rs5, %rs6}, %r2;
+; SM90-NEXT:    cvt.f32.bf16 %r9, %rs6;
+; SM90-NEXT:    cvt.f32.bf16 %r10, %rs5;
+; SM90-NEXT:    mov.b64 %rd4, {%r10, %r9};
+; SM90-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
+; SM90-NEXT:    cvt.f32.bf16 %r11, %rs8;
+; SM90-NEXT:    cvt.f32.bf16 %r12, %rs7;
+; SM90-NEXT:    mov.b64 %rd5, {%r12, %r11};
+; SM90-NEXT:    st.param.v2.b64 [func_retval0], {%rd5, %rd4};
+; SM90-NEXT:    st.param.v2.b64 [func_retval0+16], {%rd3, %rd2};
 ; SM90-NEXT:    ret;
   %load = load <8 x bfloat>, ptr addrspace(3) %arg, align 16
   %res = fpext <8 x bfloat> %load to <8 x float>
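
The CHECK updates above reflect a change in how the <8 x float> return value is lowered for this test: each adjacent pair of 32-bit results is now packed into a 64-bit register with mov.b64 and stored with st.param.v2.b64, instead of two st.param.v4.b32 stores, which is why the .b64 register count grows from %rd<2> to %rd<6>. For reference, below is a minimal sketch of the IR the test exercises, reconstructed from the hunk header and the trailing context lines; the closing ret and brace are outside this hunk and are assumed here.

define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
  ; load eight bf16 values from shared memory
  %load = load <8 x bfloat>, ptr addrspace(3) %arg, align 16
  ; widen each bf16 lane to f32; this is what the CHECK lines above lower
  %res = fpext <8 x bfloat> %load to <8 x float>
  ; assumed: return the widened vector (not shown in this hunk)
  ret <8 x float> %res
}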