@@ -286,11 +286,11 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
         types_per_step = (512 / 8) / sizeof(float); \
         for (; left_over >= types_per_step; left_over -= types_per_step) { \
-            __m512 vecA = _mm512_load_ps ((__m512*)in); \
-            __m512 vecB = _mm512_load_ps ((__m512*)out); \
+            __m512 vecA = _mm512_loadu_ps ((__m512*)in); \
+            __m512 vecB = _mm512_loadu_ps ((__m512*)out); \
             in += types_per_step; \
             __m512 res = _mm512_##op##_ps(vecA, vecB); \
-            _mm512_store_ps ((__m512*)out, res); \
+            _mm512_storeu_ps ((__m512*)out, res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -304,11 +304,11 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
         types_per_step = (256 / 8) / sizeof(float); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m256 vecA = _mm256_load_ps (in); \
+            __m256 vecA = _mm256_loadu_ps (in); \
             in += types_per_step; \
-            __m256 vecB = _mm256_load_ps (out); \
+            __m256 vecB = _mm256_loadu_ps (out); \
             __m256 res = _mm256_##op##_ps(vecA, vecB); \
-            _mm256_store_ps (out, res); \
+            _mm256_storeu_ps (out, res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -322,11 +322,11 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) { \
         types_per_step = (128 / 8) / sizeof(float); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m128 vecA = _mm_load_ps (in); \
+            __m128 vecA = _mm_loadu_ps (in); \
             in += types_per_step; \
-            __m128 vecB = _mm_load_ps (out); \
+            __m128 vecB = _mm_loadu_ps (out); \
             __m128 res = _mm_##op##_ps(vecA, vecB); \
-            _mm_store_ps (out, res); \
+            _mm_storeu_ps (out, res); \
             out += types_per_step; \
         } \
     }
@@ -367,11 +367,11 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
         types_per_step = (512 / 8) / sizeof(double); \
         for (; left_over >= types_per_step; left_over -= types_per_step) { \
-            __m512d vecA = _mm512_load_pd (in); \
+            __m512d vecA = _mm512_loadu_pd (in); \
             in += types_per_step; \
-            __m512d vecB = _mm512_load_pd (out); \
+            __m512d vecB = _mm512_loadu_pd (out); \
             __m512d res = _mm512_##op##_pd(vecA, vecB); \
-            _mm512_store_pd ((out), res); \
+            _mm512_storeu_pd ((out), res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -385,11 +385,11 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
         types_per_step = (256 / 8) / sizeof(double); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m256d vecA = _mm256_load_pd (in); \
+            __m256d vecA = _mm256_loadu_pd (in); \
             in += types_per_step; \
-            __m256d vecB = _mm256_load_pd (out); \
+            __m256d vecB = _mm256_loadu_pd (out); \
             __m256d res = _mm256_##op##_pd(vecA, vecB); \
-            _mm256_store_pd (out, res); \
+            _mm256_storeu_pd (out, res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -403,11 +403,11 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) { \
         types_per_step = (128 / 8) / sizeof(double); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m128d vecA = _mm_load_pd (in); \
+            __m128d vecA = _mm_loadu_pd (in); \
             in += types_per_step; \
-            __m128d vecB = _mm_load_pd (out); \
+            __m128d vecB = _mm_loadu_pd (out); \
             __m128d res = _mm_##op##_pd(vecA, vecB); \
-            _mm_store_pd (out, res); \
+            _mm_storeu_pd (out, res); \
             out += types_per_step; \
         } \
     }
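
The 2buff hunks above all make the same substitution: every aligned SIMD load/store (`_mm*_load_ps/pd`, `_mm*_store_ps/pd`) becomes its unaligned counterpart (`loadu`/`storeu`), because the aligned forms fault when a reduction buffer is not aligned to the vector width. The sketch below is not part of the patch; it is a minimal standalone illustration of the 2-buffer pattern these macros expand to for the AVX float path, with a hypothetical function name and a plain scalar tail standing in for the narrower SIMD fallbacks (compile with `-mavx`).

```c
#include <immintrin.h>
#include <stddef.h>

/* Illustrative only: accumulate in[] into out[] the way the 2buff macros do,
 * using unaligned intrinsics so any buffer alignment is accepted. */
static void sum_float_2buff(const float *in, float *out, size_t left_over)
{
    size_t types_per_step = (256 / 8) / sizeof(float);   /* 8 floats per __m256 */
    for (; left_over >= types_per_step; left_over -= types_per_step) {
        __m256 vecA = _mm256_loadu_ps(in);                /* no 32-byte alignment required */
        in += types_per_step;
        __m256 vecB = _mm256_loadu_ps(out);
        __m256 res  = _mm256_add_ps(vecA, vecB);          /* ##op## shown here as add */
        _mm256_storeu_ps(out, res);
        out += types_per_step;
    }
    for (; left_over > 0; left_over--)                    /* scalar remainder */
        *out++ += *in++;
}
```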
@@ -813,12 +813,12 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
         types_per_step = (512 / 8) / sizeof(float); \
         for (; left_over >= types_per_step; left_over -= types_per_step) { \
-            __m512 vecA = _mm512_load_ps (in1); \
-            __m512 vecB = _mm512_load_ps (in2); \
+            __m512 vecA = _mm512_loadu_ps (in1); \
+            __m512 vecB = _mm512_loadu_ps (in2); \
             in1 += types_per_step; \
             in2 += types_per_step; \
             __m512 res = _mm512_##op##_ps(vecA, vecB); \
-            _mm512_store_ps (out, res); \
+            _mm512_storeu_ps (out, res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -832,12 +832,12 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
         types_per_step = (256 / 8) / sizeof(float); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m256 vecA = _mm256_load_ps (in1); \
-            __m256 vecB = _mm256_load_ps (in2); \
+            __m256 vecA = _mm256_loadu_ps (in1); \
+            __m256 vecB = _mm256_loadu_ps (in2); \
             in1 += types_per_step; \
             in2 += types_per_step; \
             __m256 res = _mm256_##op##_ps(vecA, vecB); \
-            _mm256_store_ps (out, res); \
+            _mm256_storeu_ps (out, res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -851,12 +851,12 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) { \
         types_per_step = (128 / 8) / sizeof(float); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m128 vecA = _mm_load_ps (in1); \
-            __m128 vecB = _mm_load_ps (in2); \
+            __m128 vecA = _mm_loadu_ps (in1); \
+            __m128 vecB = _mm_loadu_ps (in2); \
             in1 += types_per_step; \
             in2 += types_per_step; \
             __m128 res = _mm_##op##_ps(vecA, vecB); \
-            _mm_store_ps (out, res); \
+            _mm_storeu_ps (out, res); \
             out += types_per_step; \
         } \
     }
@@ -899,12 +899,12 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
         types_per_step = (512 / 8) / sizeof(double); \
         for (; left_over >= types_per_step; left_over -= types_per_step) { \
-            __m512d vecA = _mm512_load_pd ((in1)); \
-            __m512d vecB = _mm512_load_pd ((in2)); \
+            __m512d vecA = _mm512_loadu_pd ((in1)); \
+            __m512d vecB = _mm512_loadu_pd ((in2)); \
             in1 += types_per_step; \
             in2 += types_per_step; \
             __m512d res = _mm512_##op##_pd(vecA, vecB); \
-            _mm512_store_pd ((out), res); \
+            _mm512_storeu_pd ((out), res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -918,12 +918,12 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
         types_per_step = (256 / 8) / sizeof(double); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m256d vecA = _mm256_load_pd (in1); \
-            __m256d vecB = _mm256_load_pd (in2); \
+            __m256d vecA = _mm256_loadu_pd (in1); \
+            __m256d vecB = _mm256_loadu_pd (in2); \
             in1 += types_per_step; \
             in2 += types_per_step; \
             __m256d res = _mm256_##op##_pd(vecA, vecB); \
-            _mm256_store_pd (out, res); \
+            _mm256_storeu_pd (out, res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -937,12 +937,12 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) { \
         types_per_step = (128 / 8) / sizeof(double); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m128d vecA = _mm_load_pd (in1); \
-            __m128d vecB = _mm_load_pd (in2); \
+            __m128d vecA = _mm_loadu_pd (in1); \
+            __m128d vecB = _mm_loadu_pd (in2); \
             in1 += types_per_step; \
             in2 += types_per_step; \
             __m128d res = _mm_##op##_pd(vecA, vecB); \
-            _mm_store_pd (out, res); \
+            _mm_storeu_pd (out, res); \
             out += types_per_step; \
         } \
     }
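
The 3buff hunks apply the same aligned-to-unaligned substitution to the three-buffer form (two read-only inputs, one output). Again as a hedged sketch only, with a hypothetical function name and a scalar tail in place of the narrower SIMD fallbacks:

```c
#include <immintrin.h>
#include <stddef.h>

/* Illustrative only: out[] = in1[] op in2[] in the 3buff style, with unaligned
 * loads/stores so none of the three buffers needs 32-byte alignment. */
static void sum_float_3buff(const float *in1, const float *in2,
                            float *out, size_t left_over)
{
    size_t types_per_step = (256 / 8) / sizeof(float);
    for (; left_over >= types_per_step; left_over -= types_per_step) {
        __m256 vecA = _mm256_loadu_ps(in1);
        __m256 vecB = _mm256_loadu_ps(in2);
        in1 += types_per_step;
        in2 += types_per_step;
        _mm256_storeu_ps(out, _mm256_add_ps(vecA, vecB));
        out += types_per_step;
    }
    for (; left_over > 0; left_over--)
        *out++ = *in1++ + *in2++;
}
```

On recent x86 CPUs the unaligned intrinsics typically run at the same speed as the aligned ones when the address happens to be aligned, so the substitution costs little while making the reduction safe for arbitrarily aligned MPI buffers.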