From c659362daef40159071ec9b02c7c8f2218cb61ca Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 12 Jul 2023 17:15:42 +0200 Subject: [PATCH] Enable use of `cudaMemcpyAsync` for `thrust::copy` In case of contiguous ranges of trivially relocatable types we can directly utilize `cudaMemcpyAsync` instead of going through transform. Fixes #210 --- .../detail/internal/copy_device_to_device.h | 48 +++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h index 69c4e20dfdf..a0d9d492dac 100644 --- a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h +++ b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h @@ -33,12 +33,38 @@ #include #include #include +#include +#include #include +#include THRUST_NAMESPACE_BEGIN namespace cuda_cub { namespace __copy { + template + OutputIt THRUST_RUNTIME_FUNCTION + device_to_device(execution_policy& policy, + InputIt first, + InputIt last, + OutputIt result, + thrust::detail::true_type) + { + typedef typename thrust::iterator_traits::value_type InputTy; + const auto n = thrust::distance(first, last); + if (n > 0) { + cudaError status; + status = trivial_copy_device_to_device(policy, + reinterpret_cast(thrust::raw_pointer_cast(&*result)), + reinterpret_cast(thrust::raw_pointer_cast(&*first)), + n); + cuda_cub::throw_on_error(status, "__copy:: D->D: failed"); + } + + return result + n; + } template & policy, InputIt first, InputIt last, - OutputIt result) + OutputIt result, + thrust::detail::false_type) { typedef typename thrust::iterator_traits::value_type InputTy; return cuda_cub::transform(policy, + first, + last, + result, + thrust::identity()); + } + + template + OutputIt THRUST_RUNTIME_FUNCTION + device_to_device(execution_policy& policy, + InputIt first, + InputIt last, + OutputIt result) + { + return device_to_device(policy, first, 
last, result, - thrust::identity()); + typename is_indirectly_trivially_relocatable_to::type()); } - } // namespace __copy } // namespace cuda_cub