Skip to content

Commit de6e848

Browse files
authored
[SYCL][ROCm] memsetBufferFill for patterns greater than 4 bytes (#4252)
Adds a case for applying patterns larger than 4 bytes to a memory buffer. It first applies a 4-byte memset pass and then follows up with strided 2D memset passes for the remaining pattern segments. The initial 4-byte pass provides a computational speed-up of between 21% and 49%, depending on the buffer and pattern size, compared to using only the strided 2D memset for all pattern segments.
1 parent af06e39 commit de6e848

File tree

1 file changed

+33
-2
lines changed

1 file changed

+33
-2
lines changed

sycl/plugins/rocm/pi_rocm.cpp

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3751,7 +3751,8 @@ pi_result rocm_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer,
37513751
result = retImplEv->start();
37523752
}
37533753

3754-
auto dstDevice = buffer->mem_.buffer_mem_.get_with_offset(offset);
3754+
auto dstDevice =
3755+
(uint8_t *)buffer->mem_.buffer_mem_.get_with_offset(offset);
37553756
auto stream = command_queue->get();
37563757
auto N = size / pattern_size;
37573758

@@ -3774,7 +3775,37 @@ pi_result rocm_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer,
37743775
}
37753776

37763777
default: {
3777-
result = PI_INVALID_VALUE;
3778+
// HIP has no memset functions that allow setting values more than 4
3779+
// bytes. PI API lets you pass an arbitrary "pattern" to the buffer
3780+
// fill, which can be more than 4 bytes. We must break up the pattern
3781+
// into 1 byte values, and set the buffer using multiple strided calls.
3782+
// The first 4 bytes of the pattern are set using hipMemsetD32Async, then all
3783+
// subsequent 1 byte patterns are set using hipMemset2DAsync which is
3784+
// called for each pattern.
3785+
3786+
// Calculate the number of patterns, stride, number of times the pattern
3787+
// needs to be applied, and the number of times the first 32 bit pattern
3788+
// needs to be applied.
3789+
auto number_of_steps = pattern_size / sizeof(uint8_t);
3790+
auto pitch = number_of_steps * sizeof(uint8_t);
3791+
auto height = size / number_of_steps;
3792+
auto count_32 = size / sizeof(uint32_t);
3793+
3794+
// Get 4-byte chunk of the pattern and call hipMemsetD32Async
3795+
auto value = *(static_cast<const uint32_t *>(pattern));
3796+
result =
3797+
PI_CHECK_ERROR(hipMemsetD32Async(dstDevice, value, count_32, stream));
3798+
for (auto step = 4u; step < number_of_steps; ++step) {
3799+
// take 1 byte of the pattern
3800+
value = *(static_cast<const uint8_t *>(pattern) + step);
3801+
3802+
// offset the pointer to the part of the buffer we want to write to
3803+
auto offset_ptr = dstDevice + (step * sizeof(uint8_t));
3804+
3805+
// set all of the pattern chunks
3806+
result = PI_CHECK_ERROR(hipMemset2DAsync(
3807+
offset_ptr, pitch, value, sizeof(uint8_t), height, stream));
3808+
}
37783809
break;
37793810
}
37803811
}

0 commit comments

Comments
 (0)