Skip to content

Commit e31be23

Browse files
Cuda pad optimize when no padding is needed. (#2625)
* Shortcut cuda Pad() when no padding is needed.
1 parent 8631b70 commit e31be23

File tree

1 file changed

+12
-2
lines changed
  • onnxruntime/core/providers/cuda/tensor

1 file changed

+12
-2
lines changed

onnxruntime/core/providers/cuda/tensor/pad.cc

Lines changed: 12 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -36,9 +36,9 @@ typename ToCudaType<T>::MappedType ToCudaValue(const T& value) {
3636
return value;
3737
}
3838

39-
template<>
39+
template <>
4040
typename ToCudaType<MLFloat16>::MappedType ToCudaValue<MLFloat16>(const MLFloat16& value) {
41-
return *reinterpret_cast<const typename ToCudaType<MLFloat16>::MappedType *>(&value.val);
41+
return *reinterpret_cast<const typename ToCudaType<MLFloat16>::MappedType*>(&value.val);
4242
}
4343

4444
template <typename T>
@@ -120,6 +120,16 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
120120
}
121121

122122
auto& output_tensor = *ctx->Output(0, output_shape);
123+
if (std::all_of(p_pads->begin(), p_pads->end(), [](const int64_t v) { return v == 0; }) &&
124+
std::all_of(p_slices->begin(), p_slices->end(), [](const int64_t v) { return v == 0; }) &&
125+
output_shape.Size() > 0) {
126+
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(
127+
output_tensor.template MutableData<T>(), input_tensor.template Data<T>(),
128+
sizeof(typename ToCudaType<T>::MappedType) * output_shape.Size(),
129+
cudaMemcpyDeviceToDevice, 0));
130+
return Status::OK();
131+
}
132+
123133
ORT_ENFORCE(CalculateFdmStrides(fdm_output_strides.CpuSpan(), output_dims));
124134
ORT_RETURN_IF_ERROR(input_dims.CopyToGpu());
125135
ORT_RETURN_IF_ERROR(input_strides.CopyToGpu());

0 commit comments

Comments
 (0)