File tree Expand file tree Collapse file tree 1 file changed +12
-2
lines changed
onnxruntime/core/providers/cuda/tensor Expand file tree Collapse file tree 1 file changed +12
-2
lines changed Original file line number Diff line number Diff line change @@ -36,9 +36,9 @@ typename ToCudaType<T>::MappedType ToCudaValue(const T& value) {
36
36
return value;
37
37
}
38
38
39
- template <>
39
+ template <>
40
40
typename ToCudaType<MLFloat16>::MappedType ToCudaValue<MLFloat16>(const MLFloat16& value) {
41
- return *reinterpret_cast <const typename ToCudaType<MLFloat16>::MappedType *>(&value.val );
41
+ return *reinterpret_cast <const typename ToCudaType<MLFloat16>::MappedType*>(&value.val );
42
42
}
43
43
44
44
template <typename T>
@@ -120,6 +120,16 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
120
120
}
121
121
122
122
auto & output_tensor = *ctx->Output (0 , output_shape);
123
+ if (std::all_of (p_pads->begin (), p_pads->end (), [](const int64_t v) { return v == 0 ; }) &&
124
+ std::all_of (p_slices->begin (), p_slices->end (), [](const int64_t v) { return v == 0 ; }) &&
125
+ output_shape.Size () > 0 ) {
126
+ CUDA_RETURN_IF_ERROR (cudaMemcpyAsync (
127
+ output_tensor.template MutableData <T>(), input_tensor.template Data <T>(),
128
+ sizeof (typename ToCudaType<T>::MappedType) * output_shape.Size (),
129
+ cudaMemcpyDeviceToDevice, 0 ));
130
+ return Status::OK ();
131
+ }
132
+
123
133
ORT_ENFORCE (CalculateFdmStrides (fdm_output_strides.CpuSpan (), output_dims));
124
134
ORT_RETURN_IF_ERROR (input_dims.CopyToGpu ());
125
135
ORT_RETURN_IF_ERROR (input_strides.CopyToGpu ());
You can’t perform that action at this time.
0 commit comments