@@ -479,6 +479,290 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
479
479
return ret ;
480
480
}
481
481
482
#pragma push_macro("__INTRINSIC_LOAD")
// Emits an inline __device__ load wrapper named __FnName that loads a scalar
// __DeclType from global memory with the given PTX load opcode. The value is
// read into a __TmpType register (inline-asm constraint __AsmType) and then
// cast back to __DeclType; sub-word types go through a wider register.
// __Clobber is either empty or a clobber list such as `: "memory"`.
// Fixed: removed the space between the macro name and its parameter list —
// with the space this was an object-like macro and every use below expanded
// to garbage instead of a function definition.
#define __INTRINSIC_LOAD(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
                         __Clobber)                                           \
  inline __device__ __DeclType __FnName(const __DeclType *__ptr) {            \
    __TmpType __ret;                                                          \
    asm(__AsmOp " %0, [%1];" : __AsmType(__ret) : "l"(__ptr)__Clobber);       \
    return (__DeclType)__ret;                                                 \
  }
491
#pragma push_macro("__INTRINSIC_LOAD2")
// Two-element vector variant of __INTRINSIC_LOAD: loads {x, y} with a single
// v2 PTX load into a __TmpType temporary, then converts each lane to the
// element type of __DeclType.
// Fixed: removed the space between the macro name and its parameter list
// (object-like-macro bug, see __INTRINSIC_LOAD); also made the two lane
// casts use the same parenthesized form.
#define __INTRINSIC_LOAD2(__FnName, __AsmOp, __DeclType, __TmpType,           \
                          __AsmType, __Clobber)                               \
  inline __device__ __DeclType __FnName(const __DeclType *__ptr) {            \
    __DeclType __ret;                                                         \
    __TmpType __tmp;                                                          \
    asm(__AsmOp " {%0,%1}, [%2];"                                             \
        : __AsmType(__tmp.x), __AsmType(__tmp.y)                              \
        : "l"(__ptr)__Clobber);                                               \
    using __ElementType = decltype(__ret.x);                                  \
    __ret.x = (__ElementType)(__tmp.x);                                       \
    __ret.y = (__ElementType)(__tmp.y);                                       \
    return __ret;                                                             \
  }
505
+
506
#pragma push_macro("__INTRINSIC_LOAD4")
// Four-element vector variant of __INTRINSIC_LOAD: loads {x, y, z, w} with a
// single v4 PTX load into a __TmpType temporary, then converts each lane to
// the element type of __DeclType.
// Fixed: removed the space between the macro name and its parameter list
// (object-like-macro bug, see __INTRINSIC_LOAD).
#define __INTRINSIC_LOAD4(__FnName, __AsmOp, __DeclType, __TmpType,           \
                          __AsmType, __Clobber)                               \
  inline __device__ __DeclType __FnName(const __DeclType *__ptr) {            \
    __DeclType __ret;                                                         \
    __TmpType __tmp;                                                          \
    asm(__AsmOp " {%0,%1,%2,%3}, [%4];"                                       \
        : __AsmType(__tmp.x), __AsmType(__tmp.y), __AsmType(__tmp.z),         \
          __AsmType(__tmp.w)                                                  \
        : "l"(__ptr)__Clobber);                                               \
    using __ElementType = decltype(__ret.x);                                  \
    __ret.x = (__ElementType)(__tmp.x);                                       \
    __ret.y = (__ElementType)(__tmp.y);                                       \
    __ret.z = (__ElementType)(__tmp.z);                                       \
    __ret.w = (__ElementType)(__tmp.w);                                       \
    return __ret;                                                             \
  }
523
+
524
// __ldcg: loads with the PTX `.cg` (cache-global) qualifier.
// Signed integer variants; 8/16-bit values travel through a wider register.
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", char, unsigned int, "=r", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", signed char, unsigned int, "=r", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s16", short, unsigned short, "=h", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s32", int, unsigned int, "=r", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s64", long long, unsigned long long,
                 "=l", );

__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s8", char2, int2, "=r", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s8", char4, int4, "=r", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s16", short2, short2, "=h", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s16", short4, short4, "=h", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s32", int2, int2, "=r", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s32", int4, int4, "=r", );
// Fixed: dropped a stray trailing space inside the "...v2.s64" opcode string
// so the emitted PTX is formatted like every other variant.
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s64", longlong2, longlong2,
                  "=l", );

// Unsigned integer variants.
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u8", unsigned char, unsigned int,
                 "=r", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u16", unsigned short, unsigned short,
                 "=h", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u32", unsigned int, unsigned int,
                 "=r", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u64", unsigned long long,
                 unsigned long long, "=l", );

__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u8", uchar2, int2, "=r", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u8", uchar4, int4, "=r", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u16", ushort2, ushort2, "=h", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u16", ushort4, ushort4, "=h", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u32", uint2, uint2, "=r", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u32", uint4, uint4, "=r", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u64", ulonglong2, ulonglong2,
                  "=l", );

// Floating-point variants.
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f32", float, float, "=f", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f64", double, double, "=d", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f32", float2, float2, "=f", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.f32", float4, float4, "=f", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f64", double2, double2, "=d", );
562
+
563
// `long` width is platform-dependent, so this overload selects the 64- or
// 32-bit `.cg` load at compile time via the constant sizeof check; the dead
// branch is eliminated by the compiler.
inline __device__ long __ldcg(const long *__ptr) {
  unsigned long __val;
  if (sizeof(long) == 8) {
    asm("ld.global.cg.s64 %0, [%1];" : "=l"(__val) : "l"(__ptr));
  } else {
    asm("ld.global.cg.s32 %0, [%1];" : "=r"(__val) : "l"(__ptr));
  }
  return (long)__val;
}
572
+
573
// __ldcv: loads with the PTX `.cv` qualifier. Every variant carries a
// "memory" clobber so the compiler cannot cache or reorder these reads.
// Unsigned integer variants.
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u8", unsigned char, unsigned int,
                 "=r", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u16", unsigned short, unsigned short,
                 "=h", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u32", unsigned int, unsigned int,
                 "=r", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u64", unsigned long long,
                 unsigned long long, "=l", : "memory");

// Signed integer variants.
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", char, unsigned int,
                 "=r", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", signed char, unsigned int,
                 "=r", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s16", short, unsigned short,
                 "=h", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s32", int, unsigned int,
                 "=r", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s64", long long, unsigned long long,
                 "=l", : "memory");

__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u8", uchar2, uint2,
                  "=r", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u8", uchar4, uint4,
                  "=r", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u16", ushort2, ushort2,
                  "=h", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u16", ushort4, ushort4,
                  "=h", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u32", uint2, uint2,
                  "=r", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u32", uint4, uint4,
                  "=r", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u64", ulonglong2, ulonglong2,
                  "=l", : "memory");

__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s8", char2, int2,
                  "=r", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s8", char4, int4,
                  "=r", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s16", short2, short2,
                  "=h", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s16", short4, short4,
                  "=h", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s32", int2, int2,
                  "=r", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s32", int4, int4,
                  "=r", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s64", longlong2, longlong2,
                  "=l", : "memory");

// Floating-point variants.
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f32", float, float, "=f", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f64", double, double, "=d", : "memory");

__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f32", float2, float2,
                  "=f", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.f32", float4, float4,
                  "=f", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f64", double2, double2,
                  "=d", : "memory");
628
+
629
// Width-dispatching `.cv` load for platform-dependent `long`; the sizeof
// condition is a compile-time constant, so only one asm survives.
inline __device__ long __ldcv(const long *__ptr) {
  unsigned long __val;
  if (sizeof(long) == 8) {
    asm("ld.global.cv.s64 %0, [%1];" : "=l"(__val) : "l"(__ptr));
  } else {
    asm("ld.global.cv.s32 %0, [%1];" : "=r"(__val) : "l"(__ptr));
  }
  return (long)__val;
}
638
+
639
// __ldcs: loads with the PTX `.cs` (cache-streaming) qualifier.
// Signed integer variants.
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", char, unsigned int, "=r", );
// Fixed: the signed-char variant used `signed int` as its temporary while
// every other s8 load here and in __ldcg/__ldcv uses `unsigned int`; the
// cast back to the declared type makes the two identical in behavior, so
// use `unsigned int` for consistency.
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", signed char, unsigned int, "=r", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s16", short, unsigned short, "=h", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s32", int, unsigned int, "=r", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s64", long long, unsigned long long,
                 "=l", );

__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s8", char2, int2, "=r", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s8", char4, int4, "=r", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s16", short2, short2, "=h", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s16", short4, short4, "=h", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s32", int2, int2, "=r", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s32", int4, int4, "=r", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s64", longlong2, longlong2,
                  "=l", );

// Unsigned integer variants.
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u8", unsigned char, unsigned int,
                 "=r", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u16", unsigned short, unsigned short,
                 "=h", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u32", unsigned int, unsigned int,
                 "=r", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u64", unsigned long long,
                 unsigned long long, "=l", );

__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u8", uchar2, uint2, "=r", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u8", uchar4, uint4, "=r", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u16", ushort2, ushort2, "=h", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u16", ushort4, ushort4, "=h", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u32", uint2, uint2, "=r", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u32", uint4, uint4, "=r", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u64", ulonglong2, ulonglong2,
                  "=l", );

// Floating-point variants.
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f32", float, float, "=f", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f64", double, double, "=d", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f32", float2, float2, "=f", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.f32", float4, float4, "=f", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f64", double2, double2, "=d", );

// Restore any user definitions of the load helper macros.
#pragma pop_macro("__INTRINSIC_LOAD")
#pragma pop_macro("__INTRINSIC_LOAD2")
#pragma pop_macro("__INTRINSIC_LOAD4")
681
+
682
// Width-dispatching `.cs` load for platform-dependent `long`; the sizeof
// condition is a compile-time constant, so only one asm survives.
inline __device__ long __ldcs(const long *__ptr) {
  unsigned long __val;
  if (sizeof(long) == 8) {
    asm("ld.global.cs.s64 %0, [%1];" : "=l"(__val) : "l"(__ptr));
  } else {
    asm("ld.global.cs.s32 %0, [%1];" : "=r"(__val) : "l"(__ptr));
  }
  return (long)__val;
}
691
+
692
#pragma push_macro("__INTRINSIC_STORE")
// Emits an inline __device__ store wrapper named __FnName: casts __value to
// __TmpType (asm constraint __AsmType) and issues `__AsmOp [__ptr], value;`.
// Each store clobbers "memory" so the compiler does not reorder or elide it.
// Fixed: removed the space between the macro name and its parameter list,
// which made this an object-like macro and broke every use below.
#define __INTRINSIC_STORE(__FnName, __AsmOp, __DeclType, __TmpType,           \
                          __AsmType)                                          \
  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) {    \
    __TmpType __tmp = (__TmpType)__value;                                     \
    asm(__AsmOp " [%0], %1;" ::"l"(__ptr), __AsmType(__tmp) : "memory");      \
  }
698
+
699
#pragma push_macro("__INTRINSIC_STORE2")
// Two-element vector variant of __INTRINSIC_STORE: converts each lane of
// __value to the element type of __TmpType, then issues one v2 PTX store.
// Fixed: removed the space between the macro name and its parameter list
// (object-like-macro bug, see __INTRINSIC_STORE).
#define __INTRINSIC_STORE2(__FnName, __AsmOp, __DeclType, __TmpType,          \
                           __AsmType)                                         \
  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) {    \
    __TmpType __tmp;                                                          \
    using __ElementType = decltype(__tmp.x);                                  \
    __tmp.x = (__ElementType)(__value.x);                                     \
    __tmp.y = (__ElementType)(__value.y);                                     \
    asm(__AsmOp " [%0], {%1,%2};" ::"l"(__ptr), __AsmType(__tmp.x),           \
        __AsmType(__tmp.y)                                                    \
        : "memory");                                                          \
  }
711
+
712
#pragma push_macro("__INTRINSIC_STORE4")
// Four-element vector variant of __INTRINSIC_STORE: converts each lane of
// __value to the element type of __TmpType, then issues one v4 PTX store.
// Fixed: removed the space between the macro name and its parameter list
// (object-like-macro bug, see __INTRINSIC_STORE).
#define __INTRINSIC_STORE4(__FnName, __AsmOp, __DeclType, __TmpType,          \
                           __AsmType)                                         \
  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) {    \
    __TmpType __tmp;                                                          \
    using __ElementType = decltype(__tmp.x);                                  \
    __tmp.x = (__ElementType)(__value.x);                                     \
    __tmp.y = (__ElementType)(__value.y);                                     \
    __tmp.z = (__ElementType)(__value.z);                                     \
    __tmp.w = (__ElementType)(__value.w);                                     \
    asm(__AsmOp " [%0], {%1,%2,%3,%4};" ::"l"(__ptr), __AsmType(__tmp.x),     \
        __AsmType(__tmp.y), __AsmType(__tmp.z), __AsmType(__tmp.w)            \
        : "memory");                                                          \
  }
726
+
727
// __stwt: stores with the PTX `.wt` (write-through) qualifier.
// Signed integer variants.
__INTRINSIC_STORE(__stwt, "st.global.wt.s8", char, int, "r");
__INTRINSIC_STORE(__stwt, "st.global.wt.s8", signed char, int, "r");
__INTRINSIC_STORE(__stwt, "st.global.wt.s16", short, short, "h");
__INTRINSIC_STORE(__stwt, "st.global.wt.s32", int, int, "r");
__INTRINSIC_STORE(__stwt, "st.global.wt.s64", long long, long long, "l");

__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s8", char2, int2, "r");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s8", char4, int4, "r");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s16", short2, short2, "h");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s16", short4, short4, "h");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s32", int2, int2, "r");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s32", int4, int4, "r");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s64", longlong2, longlong2, "l");

// Unsigned integer variants.
__INTRINSIC_STORE(__stwt, "st.global.wt.u8", unsigned char, int, "r");
__INTRINSIC_STORE(__stwt, "st.global.wt.u16", unsigned short, unsigned short,
                  "h");
__INTRINSIC_STORE(__stwt, "st.global.wt.u32", unsigned int, unsigned int,
                  "r");
__INTRINSIC_STORE(__stwt, "st.global.wt.u64", unsigned long long,
                  unsigned long long, "l");

// NOTE(review): the v2.u8 variant keeps uchar2 as its temporary while v4.u8
// widens to uint4 — asymmetric, but preserved as-is; confirm against the
// upstream header before changing.
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u8", uchar2, uchar2, "r");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u8", uchar4, uint4, "r");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u16", ushort2, ushort2, "h");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u16", ushort4, ushort4, "h");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u32", uint2, uint2, "r");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u32", uint4, uint4, "r");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u64", ulonglong2, ulonglong2,
                   "l");

// Floating-point variants.
__INTRINSIC_STORE(__stwt, "st.global.wt.f32", float, float, "f");
__INTRINSIC_STORE(__stwt, "st.global.wt.f64", double, double, "d");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f32", float2, float2, "f");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.f32", float4, float4, "f");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f64", double2, double2, "d");

// Restore any user definitions of the store helper macros.
#pragma pop_macro("__INTRINSIC_STORE")
#pragma pop_macro("__INTRINSIC_STORE2")
#pragma pop_macro("__INTRINSIC_STORE4")
765
+
482
766
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
483
767
484
768
#if CUDA_VERSION >= 11000
0 commit comments