
Commit c07d77f

Merge pull request #7957 from bosilca/fix/avx_alignment

Use the unaligned SSE memory access primitive.

2 parents: e5ef80f + c4e88a4
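This commit replaces the aligned SIMD load/store intrinsics in the AVX op component (_mm_load_ps, _mm256_load_pd, _mm512_store_ps, and friends) with their unaligned counterparts (_mm_loadu_ps, _mm256_loadu_pd, _mm512_storeu_ps, ...). MPI reduction buffers come from the caller with no alignment guarantee, and an aligned load or store through a pointer that is not 16-byte (SSE), 32-byte (AVX), or 64-byte (AVX-512) aligned is undefined behavior that typically raises a general-protection fault. A minimal standalone sketch of the distinction (hypothetical example, not part of the patch):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void)
    {
        float data[8] = {0, 1, 2, 3, 4, 5, 6, 7};
        float *p = data + 1;   /* almost certainly not 16-byte aligned */

        /* _mm_load_ps(p) requires a 16-byte-aligned pointer and may
         * fault here; _mm_loadu_ps accepts any alignment. */
        __m128 v = _mm_loadu_ps(p);

        float out[4];
        _mm_storeu_ps(out, v); /* unaligned store, same rationale */
        printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
        return 0;
    }

On most recent x86 cores the unaligned variants perform on par with the aligned ones when the data happens to be aligned, so the safer form costs little.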

File tree: 2 files changed, +363 -1080 lines changed


ompi/mca/op/avx/op_avx_functions.c

Lines changed: 36 additions & 36 deletions
@@ -286,11 +286,11 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
         types_per_step = (512 / 8) / sizeof(float); \
         for (; left_over >= types_per_step; left_over -= types_per_step) { \
-            __m512 vecA = _mm512_load_ps((__m512*)in); \
-            __m512 vecB = _mm512_load_ps((__m512*)out); \
+            __m512 vecA = _mm512_loadu_ps((__m512*)in); \
+            __m512 vecB = _mm512_loadu_ps((__m512*)out); \
             in += types_per_step; \
             __m512 res = _mm512_##op##_ps(vecA, vecB); \
-            _mm512_store_ps((__m512*)out, res); \
+            _mm512_storeu_ps((__m512*)out, res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -304,11 +304,11 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
         types_per_step = (256 / 8) / sizeof(float); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m256 vecA = _mm256_load_ps(in); \
+            __m256 vecA = _mm256_loadu_ps(in); \
             in += types_per_step; \
-            __m256 vecB = _mm256_load_ps(out); \
+            __m256 vecB = _mm256_loadu_ps(out); \
             __m256 res = _mm256_##op##_ps(vecA, vecB); \
-            _mm256_store_ps(out, res); \
+            _mm256_storeu_ps(out, res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -322,11 +322,11 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) { \
         types_per_step = (128 / 8) / sizeof(float); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m128 vecA = _mm_load_ps(in); \
+            __m128 vecA = _mm_loadu_ps(in); \
             in += types_per_step; \
-            __m128 vecB = _mm_load_ps(out); \
+            __m128 vecB = _mm_loadu_ps(out); \
             __m128 res = _mm_##op##_ps(vecA, vecB); \
-            _mm_store_ps(out, res); \
+            _mm_storeu_ps(out, res); \
             out += types_per_step; \
         } \
     }
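Every macro touched by this patch follows the same cascade visible above: check the widest instruction set the CPU advertises, consume full vectors, then fall through to narrower units (and finally scalar code) for the remainder. A condensed sketch of that shape, assuming GCC/Clang's __builtin_cpu_supports in place of OMPI_OP_AVX_HAS_FLAGS and a hypothetical sum_float_sketch name (compile with -mavx512f):

    #include <immintrin.h>
    #include <stddef.h>

    /* Sketch of the dispatch cascade; the real code is generated per
     * type and op via the OP_CONCAT macros. */
    static void sum_float_sketch(const float *in, float *out, size_t left_over)
    {
        size_t types_per_step;
        if (__builtin_cpu_supports("avx512f")) {        /* widest unit first */
            types_per_step = (512 / 8) / sizeof(float); /* 16 floats */
            for (; left_over >= types_per_step; left_over -= types_per_step) {
                __m512 vecA = _mm512_loadu_ps(in);      /* unaligned-safe */
                __m512 vecB = _mm512_loadu_ps(out);
                _mm512_storeu_ps(out, _mm512_add_ps(vecA, vecB));
                in  += types_per_step;
                out += types_per_step;
            }
            if (0 == left_over) return;
        }
        /* ... the real macros fall through to AVX (256-bit) and SSE
         * (128-bit) blocks of the same shape ... */
        for (; left_over > 0; left_over--)              /* scalar remainder */
            *out++ += *in++;
    }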
@@ -367,11 +367,11 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
         types_per_step = (512 / 8) / sizeof(double); \
         for (; left_over >= types_per_step; left_over -= types_per_step) { \
-            __m512d vecA = _mm512_load_pd(in); \
+            __m512d vecA = _mm512_loadu_pd(in); \
             in += types_per_step; \
-            __m512d vecB = _mm512_load_pd(out); \
+            __m512d vecB = _mm512_loadu_pd(out); \
             __m512d res = _mm512_##op##_pd(vecA, vecB); \
-            _mm512_store_pd((out), res); \
+            _mm512_storeu_pd((out), res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -385,11 +385,11 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
         types_per_step = (256 / 8) / sizeof(double); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m256d vecA = _mm256_load_pd(in); \
+            __m256d vecA = _mm256_loadu_pd(in); \
             in += types_per_step; \
-            __m256d vecB = _mm256_load_pd(out); \
+            __m256d vecB = _mm256_loadu_pd(out); \
             __m256d res = _mm256_##op##_pd(vecA, vecB); \
-            _mm256_store_pd(out, res); \
+            _mm256_storeu_pd(out, res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -403,11 +403,11 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) { \
         types_per_step = (128 / 8) / sizeof(double); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m128d vecA = _mm_load_pd(in); \
+            __m128d vecA = _mm_loadu_pd(in); \
             in += types_per_step; \
-            __m128d vecB = _mm_load_pd(out); \
+            __m128d vecB = _mm_loadu_pd(out); \
             __m128d res = _mm_##op##_pd(vecA, vecB); \
-            _mm_store_pd(out, res); \
+            _mm_storeu_pd(out, res); \
             out += types_per_step; \
         } \
     }
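For reference, the alignment the removed intrinsics demanded scales with the vector width: 16 bytes for the __m128/__m128d forms, 32 for __m256/__m256d, and 64 for __m512/__m512d. A tiny helper of the sort one might use to assert this (illustrative only, not in the patch):

    #include <stdint.h>
    #include <stddef.h>

    /* Alignment check for the aligned load/store variants replaced above:
     * bytes = 16 for SSE, 32 for AVX, 64 for AVX-512. */
    static inline int is_aligned(const void *p, size_t bytes)
    {
        return ((uintptr_t)p % bytes) == 0;
    }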
@@ -813,12 +813,12 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
         types_per_step = (512 / 8) / sizeof(float); \
         for (; left_over >= types_per_step; left_over -= types_per_step) { \
-            __m512 vecA = _mm512_load_ps(in1); \
-            __m512 vecB = _mm512_load_ps(in2); \
+            __m512 vecA = _mm512_loadu_ps(in1); \
+            __m512 vecB = _mm512_loadu_ps(in2); \
             in1 += types_per_step; \
             in2 += types_per_step; \
             __m512 res = _mm512_##op##_ps(vecA, vecB); \
-            _mm512_store_ps(out, res); \
+            _mm512_storeu_ps(out, res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -832,12 +832,12 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
         types_per_step = (256 / 8) / sizeof(float); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m256 vecA = _mm256_load_ps(in1); \
-            __m256 vecB = _mm256_load_ps(in2); \
+            __m256 vecA = _mm256_loadu_ps(in1); \
+            __m256 vecB = _mm256_loadu_ps(in2); \
             in1 += types_per_step; \
             in2 += types_per_step; \
             __m256 res = _mm256_##op##_ps(vecA, vecB); \
-            _mm256_store_ps(out, res); \
+            _mm256_storeu_ps(out, res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -851,12 +851,12 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) { \
         types_per_step = (128 / 8) / sizeof(float); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m128 vecA = _mm_load_ps(in1); \
-            __m128 vecB = _mm_load_ps(in2); \
+            __m128 vecA = _mm_loadu_ps(in1); \
+            __m128 vecB = _mm_loadu_ps(in2); \
             in1 += types_per_step; \
             in2 += types_per_step; \
             __m128 res = _mm_##op##_ps(vecA, vecB); \
-            _mm_store_ps(out, res); \
+            _mm_storeu_ps(out, res); \
             out += types_per_step; \
         } \
     }
@@ -899,12 +899,12 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
         types_per_step = (512 / 8) / sizeof(double); \
         for (; left_over >= types_per_step; left_over -= types_per_step) { \
-            __m512d vecA = _mm512_load_pd((in1)); \
-            __m512d vecB = _mm512_load_pd((in2)); \
+            __m512d vecA = _mm512_loadu_pd((in1)); \
+            __m512d vecB = _mm512_loadu_pd((in2)); \
             in1 += types_per_step; \
             in2 += types_per_step; \
             __m512d res = _mm512_##op##_pd(vecA, vecB); \
-            _mm512_store_pd((out), res); \
+            _mm512_storeu_pd((out), res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -918,12 +918,12 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
         types_per_step = (256 / 8) / sizeof(double); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m256d vecA = _mm256_load_pd(in1); \
-            __m256d vecB = _mm256_load_pd(in2); \
+            __m256d vecA = _mm256_loadu_pd(in1); \
+            __m256d vecB = _mm256_loadu_pd(in2); \
             in1 += types_per_step; \
             in2 += types_per_step; \
             __m256d res = _mm256_##op##_pd(vecA, vecB); \
-            _mm256_store_pd(out, res); \
+            _mm256_storeu_pd(out, res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -937,12 +937,12 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) { \
         types_per_step = (128 / 8) / sizeof(double); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m128d vecA = _mm_load_pd(in1); \
-            __m128d vecB = _mm_load_pd(in2); \
+            __m128d vecA = _mm_loadu_pd(in1); \
+            __m128d vecB = _mm_loadu_pd(in2); \
             in1 += types_per_step; \
             in2 += types_per_step; \
             __m128d res = _mm_##op##_pd(vecA, vecB); \
-            _mm_store_pd(out, res); \
+            _mm_storeu_pd(out, res); \
             out += types_per_step; \
         } \
     }
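One way to exercise the fix end to end is to hand a reduction deliberately misaligned buffers; with the aligned intrinsics this could fault once the AVX component was selected. A hedged sketch (whether the AVX op component actually handles the reduction depends on build options and runtime CPU flags):

    #include <mpi.h>
    #include <stdlib.h>

    int main(int argc, char **argv)
    {
        MPI_Init(&argc, &argv);

        /* Offset by one float: buffers stay 4-byte aligned but are no
         * longer 16/32/64-byte aligned, the case the aligned intrinsics
         * faulted on. */
        float *raw_in  = malloc((1024 + 1) * sizeof(float));
        float *raw_out = malloc((1024 + 1) * sizeof(float));
        float *in = raw_in + 1, *out = raw_out + 1;
        for (int i = 0; i < 1024; i++) { in[i] = 1.0f; out[i] = 0.0f; }

        MPI_Allreduce(in, out, 1024, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);

        free(raw_in);
        free(raw_out);
        MPI_Finalize();
        return 0;
    }

Run with, e.g., mpirun -n 2 ./a.out.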
