|
4 | 4 | * reserved.
|
5 | 5 | * Copyright (c) 2019 Arm Ltd. All rights reserved.
|
6 | 6 | * Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
|
| 7 | + * Copyright (c) 2024 Research Organization for Information Science |
| 8 | + * and Technology (RIST). All rights reserved. |
7 | 9 | *
|
8 | 10 | * $COPYRIGHT$
|
9 | 11 | *
|
@@ -140,20 +142,18 @@ _Generic((*(out)), \
|
140 | 142 | struct ompi_datatype_t **dtype, \
|
141 | 143 | struct ompi_op_base_module_1_0_0_t *module) \
|
142 | 144 | { \
|
143 |
| - int types_per_step = svcnt(*((type##type_size##_t *) _in)); \ |
144 |
| - size_t idx = 0, left_over = *count; \ |
| 145 | + const int types_per_step = svcnt(*((type##type_size##_t *) _in)); \ |
| 146 | + const int cnt = *count; \ |
145 | 147 | type##type_size##_t *in = (type##type_size##_t *) _in, \
|
146 | 148 | *out = (type##type_size##_t *) _out; \
|
147 | 149 | OP_CONCAT(OMPI_OP_TYPE_PREPEND, type##type_size##_t) vsrc, vdst; \
|
148 |
| - svbool_t pred = svwhilelt_b##type_size(idx, left_over); \ |
149 |
| - do { \ |
| 150 | + for (int idx=0; idx < cnt; idx += types_per_step) { \ |
| 151 | + svbool_t pred = svwhilelt_b##type_size(idx, cnt); \ |
150 | 152 | vsrc = svld1(pred, &in[idx]); \
|
151 | 153 | vdst = svld1(pred, &out[idx]); \
|
152 | 154 | vdst = OP_CONCAT(OMPI_OP_OP_PREPEND, op##_x)(pred, vdst, vsrc); \
|
153 | 155 | OP_CONCAT(OMPI_OP_OP_PREPEND, st1)(pred, &out[idx], vdst); \
|
154 |
| - idx += types_per_step; \ |
155 |
| - pred = svwhilelt_b##type_size(idx, left_over); \ |
156 |
| - } while (svptest_any(svptrue_b##type_size(), pred)); \ |
| 156 | + } \ |
157 | 157 | }
|
158 | 158 | #endif
|
159 | 159 |
|
@@ -308,21 +308,19 @@ static void OP_CONCAT(ompi_op_aarch64_3buff_##name##_##type##type_size##_t, APPE
|
308 | 308 | struct ompi_datatype_t **dtype, \
|
309 | 309 | struct ompi_op_base_module_1_0_0_t *module) \
|
310 | 310 | { \
|
311 |
| - int types_per_step = svcnt(*((type##type_size##_t *) _in1)); \ |
| 311 | + const int types_per_step = svcnt(*((type##type_size##_t *) _in1)); \ |
312 | 312 | type##type_size##_t *in1 = (type##type_size##_t *) _in1, \
|
313 | 313 | *in2 = (type##type_size##_t *) _in2, \
|
314 | 314 | *out = (type##type_size##_t *) _out; \
|
315 |
| - size_t idx = 0, left_over = *count; \ |
| 315 | + const int cnt = *count; \ |
316 | 316 | OP_CONCAT(OMPI_OP_TYPE_PREPEND, type##type_size##_t) vsrc, vdst; \
|
317 |
| - svbool_t pred = svwhilelt_b##type_size(idx, left_over); \ |
318 |
| - do { \ |
| 317 | + for (int idx=0; idx < cnt; idx += types_per_step) { \ |
| 318 | + svbool_t pred = svwhilelt_b##type_size(idx, cnt); \ |
319 | 319 | vsrc = svld1(pred, &in1[idx]); \
|
320 | 320 | vdst = svld1(pred, &in2[idx]); \
|
321 | 321 | vdst = OP_CONCAT(OMPI_OP_OP_PREPEND, op##_x)(pred, vdst, vsrc); \
|
322 | 322 | OP_CONCAT(OMPI_OP_OP_PREPEND, st1)(pred, &out[idx], vdst); \
|
323 |
| - idx += types_per_step; \ |
324 |
| - pred = svwhilelt_b##type_size(idx, left_over); \ |
325 |
| - } while (svptest_any(svptrue_b##type_size(), pred)); \ |
| 323 | + } \ |
326 | 324 | }
|
327 | 325 | #endif /* defined(GENERATE_SVE_CODE) */
|
328 | 326 |
|
|
0 commit comments