From 3403eb4005bdddd0c8b3b2b5e5a33ad87c0a1d13 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 30 Apr 2024 19:08:41 +0200 Subject: [PATCH 1/9] Move `vsmem` helper into their own file --- .../device/dispatch/dispatch_merge_sort.cuh | 1 + .../device/dispatch/dispatch_select_if.cuh | 1 + .../dispatch/dispatch_unique_by_key.cuh | 1 + cub/cub/util_device.cuh | 196 -------------- cub/cub/util_vsmem.cuh | 255 ++++++++++++++++++ cub/test/catch2_test_vsmem.cu | 1 + libcudacxx/include/cuda/discard_memory | 1 - 7 files changed, 259 insertions(+), 197 deletions(-) create mode 100644 cub/cub/util_vsmem.cuh diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index 47420576030..e687cf5b010 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -42,6 +42,7 @@ #include #include #include +#include #include #include diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index a22f6acd031..332671383f6 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -52,6 +52,7 @@ #include #include #include +#include #include diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh index 9531a11b746..f62d6d27537 100644 --- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -48,6 +48,7 @@ #include #include #include +#include #include diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index 8075444c526..7dc9fc1aa25 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -49,12 +49,10 @@ #include #include -#include #include // for backward compatibility #include -#include #include #include @@ -70,7 +68,6 @@ CUB_NAMESPACE_BEGIN namespace detail { - /** * @brief Helper class template that allows overwriting the `BLOCK_THREAD` and `ITEMS_PER_THREAD` * configurations of a given policy. @@ -82,199 +79,6 @@ struct policy_wrapper_t : PolicyT static constexpr int BLOCK_THREADS = BLOCK_THREADS_; static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD; }; - -/** - * @brief Helper struct to wrap all the information needed to implement virtual shared memory that's passed to a kernel. - * - */ -struct vsmem_t -{ - void* gmem_ptr; -}; - -// The maximum amount of static shared memory available per thread block -// Note that in contrast to dynamic shared memory, static shared memory is still limited to 48 KB -static constexpr std::size_t max_smem_per_block = 48 * 1024; - -/** - * @brief Class template that helps to prevent exceeding the available shared memory per thread block. - * - * @tparam AgentT The agent for which we check whether per-thread block shared memory is sufficient or whether virtual - * shared memory is needed. - */ -template -class vsmem_helper_impl -{ -private: - // Per-block virtual shared memory may be padded to make sure vsmem is an integer multiple of `line_size` - static constexpr std::size_t line_size = 128; - - // The amount of shared memory or virtual shared memory required by the algorithm's agent - static constexpr std::size_t required_smem = sizeof(typename AgentT::TempStorage); - - // Whether we need to allocate global memory-backed virtual shared memory - static constexpr bool needs_vsmem = required_smem > max_smem_per_block; - - // Padding bytes to an integer multiple of `line_size`. 
Only applies to virtual shared memory - static constexpr std::size_t padding_bytes = - (required_smem % line_size == 0) ? 0 : (line_size - (required_smem % line_size)); - -public: - // Type alias to be used for static temporary storage declaration within the algorithm's kernel - using static_temp_storage_t = cub::detail::conditional_t; - - // The amount of global memory-backed virtual shared memory needed, padded to an integer multiple of 128 bytes - static constexpr std::size_t vsmem_per_block = needs_vsmem ? (required_smem + padding_bytes) : 0; - - /** - * @brief Used from within the device algorithm's kernel to get the temporary storage that can be - * passed to the agent, specialized for the case when we can use native shared memory as temporary - * storage. - */ - static _CCCL_DEVICE _CCCL_FORCEINLINE typename AgentT::TempStorage& - get_temp_storage(typename AgentT::TempStorage& static_temp_storage, vsmem_t&) - { - return static_temp_storage; - } - - /** - * @brief Used from within the device algorithm's kernel to get the temporary storage that can be - * passed to the agent, specialized for the case when we can use native shared memory as temporary - * storage and taking a linear block id. - */ - static __device__ __forceinline__ typename AgentT::TempStorage& - get_temp_storage(typename AgentT::TempStorage& static_temp_storage, vsmem_t&, std::size_t) - { - return static_temp_storage; - } - - /** - * @brief Used from within the device algorithm's kernel to get the temporary storage that can be - * passed to the agent, specialized for the case when we have to use global memory-backed - * virtual shared memory as temporary storage. - */ - static _CCCL_DEVICE _CCCL_FORCEINLINE typename AgentT::TempStorage& - get_temp_storage(cub::NullType& static_temp_storage, vsmem_t& vsmem) - { - return *reinterpret_cast( - static_cast(vsmem.gmem_ptr) + (vsmem_per_block * blockIdx.x)); - } - - /** - * @brief Used from within the device algorithm's kernel to get the temporary storage that can be - * passed to the agent, specialized for the case when we have to use global memory-backed - * virtual shared memory as temporary storage and taking a linear block id. - */ - static __device__ __forceinline__ typename AgentT::TempStorage& - get_temp_storage(cub::NullType& static_temp_storage, vsmem_t& vsmem, std::size_t linear_block_id) - { - return *reinterpret_cast( - static_cast(vsmem.gmem_ptr) + (vsmem_per_block * linear_block_id)); - } - - /** - * @brief Hints to discard modified cache lines of the used virtual shared memory. - * modified cache lines. - * - * @note Needs to be followed by `__syncthreads()` if the function returns true and the virtual shared memory is - * supposed to be reused after this function call. - */ - template ::type = 0> - static _CCCL_DEVICE _CCCL_FORCEINLINE bool discard_temp_storage(typename AgentT::TempStorage& temp_storage) - { - return false; - } - - /** - * @brief Hints to discard modified cache lines of the used virtual shared memory. - * modified cache lines. - * - * @note Needs to be followed by `__syncthreads()` if the function returns true and the virtual shared memory is - * supposed to be reused after this function call. 
- */ - template ::type = 0> - static _CCCL_DEVICE _CCCL_FORCEINLINE bool discard_temp_storage(typename AgentT::TempStorage& temp_storage) - { - // Ensure all threads finished using temporary storage - CTA_SYNC(); - - const std::size_t linear_tid = threadIdx.x; - const std::size_t block_stride = line_size * blockDim.x; - - char* ptr = reinterpret_cast(&temp_storage); - auto ptr_end = ptr + vsmem_per_block; - - // 128 byte-aligned virtual shared memory discard - for (auto thread_ptr = ptr + (linear_tid * line_size); thread_ptr < ptr_end; thread_ptr += block_stride) - { - cuda::discard_memory(thread_ptr, line_size); - } - - return true; - } -}; - -template -constexpr bool use_fallback_agent() -{ - return (sizeof(typename DefaultAgentT::TempStorage) > max_smem_per_block) - && (sizeof(typename FallbackAgentT::TempStorage) <= max_smem_per_block); -} - -/** - * @brief Class template that helps to prevent exceeding the available shared memory per thread block with two measures: - * (1) If an agent's `TempStorage` declaration exceeds the maximum amount of shared memory per thread block, we check - * whether using a fallback policy, e.g., with a smaller tile size, would fit into shared memory. - * (2) If the fallback still doesn't fit into shared memory, we make use of virtual shared memory that is backed by - * global memory. - * - * @tparam DefaultAgentPolicyT The default tuning policy that is used if the default agent's shared memory requirements - * fall within the bounds of `max_smem_per_block` or when virtual shared memory is needed - * @tparam DefaultAgentT The default agent, instantiated with the given default tuning policy - * @tparam FallbackAgentPolicyT A fallback tuning policy that may exhibit lower shared memory requirements, e.g., by - * using a smaller tile size, than the default. This fallback policy is used if and only if the shared memory - * requirements of the default agent exceed `max_smem_per_block`, yet the shared memory requirements of the fallback - * agent falls within the bounds of `max_smem_per_block`. - * @tparam FallbackAgentT The fallback agent, instantiated with the given fallback tuning policy - */ -template ()> -struct vsmem_helper_with_fallback_impl : public vsmem_helper_impl -{ - using agent_t = DefaultAgentT; - using agent_policy_t = DefaultAgentPolicyT; -}; -template -struct vsmem_helper_with_fallback_impl - : public vsmem_helper_impl -{ - using agent_t = FallbackAgentT; - using agent_policy_t = FallbackAgentPolicyT; -}; - -/** - * @brief Alias template for the `vsmem_helper_with_fallback_impl` that instantiates the given AgentT template with the - * respective policy as first template parameter, followed by the parameters captured by the `AgentParamsT` template - * parameter pack. - */ -template class AgentT, typename... AgentParamsT> -using vsmem_helper_fallback_policy_t = - vsmem_helper_with_fallback_impl, - FallbackPolicyT, - AgentT>; - -/** - * @brief Alias template for the `vsmem_helper_t` by using a simple fallback policy that uses `DefaultPolicyT` as basis, - * overwriting `64` threads per block and `1` item per thread. - */ -template class AgentT, typename... 
AgentParamsT> -using vsmem_helper_default_fallback_policy_t = - vsmem_helper_fallback_policy_t, AgentT, AgentParamsT...>; - } // namespace detail /** diff --git a/cub/cub/util_vsmem.cuh b/cub/cub/util_vsmem.cuh new file mode 100644 index 00000000000..5eebc8533db --- /dev/null +++ b/cub/cub/util_vsmem.cuh @@ -0,0 +1,255 @@ +/****************************************************************************** + * Copyright (c) 2023-24, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * This file contains facilities that help to prevent exceeding the available shared memory per thread block + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +#include + +CUB_NAMESPACE_BEGIN + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +namespace detail +{ + +/** + * @brief Helper struct to wrap all the information needed to implement virtual shared memory that's passed to a kernel. + * + */ +struct vsmem_t +{ + void* gmem_ptr; +}; + +// The maximum amount of static shared memory available per thread block +// Note that in contrast to dynamic shared memory, static shared memory is still limited to 48 KB +static constexpr std::size_t max_smem_per_block = 48 * 1024; + +/** + * @brief Class template that helps to prevent exceeding the available shared memory per thread block. + * + * @tparam AgentT The agent for which we check whether per-thread block shared memory is sufficient or whether virtual + * shared memory is needed. 
+ */ +template +class vsmem_helper_impl +{ +private: + // Per-block virtual shared memory may be padded to make sure vsmem is an integer multiple of `line_size` + static constexpr std::size_t line_size = 128; + + // The amount of shared memory or virtual shared memory required by the algorithm's agent + static constexpr std::size_t required_smem = sizeof(typename AgentT::TempStorage); + + // Whether we need to allocate global memory-backed virtual shared memory + static constexpr bool needs_vsmem = required_smem > max_smem_per_block; + + // Padding bytes to an integer multiple of `line_size`. Only applies to virtual shared memory + static constexpr std::size_t padding_bytes = + (required_smem % line_size == 0) ? 0 : (line_size - (required_smem % line_size)); + +public: + // Type alias to be used for static temporary storage declaration within the algorithm's kernel + using static_temp_storage_t = cub::detail::conditional_t; + + // The amount of global memory-backed virtual shared memory needed, padded to an integer multiple of 128 bytes + static constexpr std::size_t vsmem_per_block = needs_vsmem ? (required_smem + padding_bytes) : 0; + + /** + * @brief Used from within the device algorithm's kernel to get the temporary storage that can be + * passed to the agent, specialized for the case when we can use native shared memory as temporary + * storage. + */ + static _CCCL_DEVICE _CCCL_FORCEINLINE typename AgentT::TempStorage& + get_temp_storage(typename AgentT::TempStorage& static_temp_storage, vsmem_t&) + { + return static_temp_storage; + } + + /** + * @brief Used from within the device algorithm's kernel to get the temporary storage that can be + * passed to the agent, specialized for the case when we can use native shared memory as temporary + * storage and taking a linear block id. + */ + static _CCCL_DEVICE _CCCL_FORCEINLINE typename AgentT::TempStorage& + get_temp_storage(typename AgentT::TempStorage& static_temp_storage, vsmem_t&, std::size_t) + { + return static_temp_storage; + } + + /** + * @brief Used from within the device algorithm's kernel to get the temporary storage that can be + * passed to the agent, specialized for the case when we have to use global memory-backed + * virtual shared memory as temporary storage. + */ + static _CCCL_DEVICE _CCCL_FORCEINLINE typename AgentT::TempStorage& + get_temp_storage(cub::NullType& static_temp_storage, vsmem_t& vsmem) + { + return *reinterpret_cast( + static_cast(vsmem.gmem_ptr) + (vsmem_per_block * blockIdx.x)); + } + + /** + * @brief Used from within the device algorithm's kernel to get the temporary storage that can be + * passed to the agent, specialized for the case when we have to use global memory-backed + * virtual shared memory as temporary storage and taking a linear block id. + */ + static _CCCL_DEVICE _CCCL_FORCEINLINE typename AgentT::TempStorage& + get_temp_storage(cub::NullType& static_temp_storage, vsmem_t& vsmem, std::size_t linear_block_id) + { + return *reinterpret_cast( + static_cast(vsmem.gmem_ptr) + (vsmem_per_block * linear_block_id)); + } + + /** + * @brief Hints to discard modified cache lines of the used virtual shared memory. + * modified cache lines. + * + * @note Needs to be followed by `__syncthreads()` if the function returns true and the virtual shared memory is + * supposed to be reused after this function call. 
+ */ + template ::type = 0> + static _CCCL_DEVICE _CCCL_FORCEINLINE bool discard_temp_storage(typename AgentT::TempStorage& temp_storage) + { + return false; + } + + /** + * @brief Hints to discard modified cache lines of the used virtual shared memory. + * modified cache lines. + * + * @note Needs to be followed by `__syncthreads()` if the function returns true and the virtual shared memory is + * supposed to be reused after this function call. + */ + template ::type = 0> + static _CCCL_DEVICE _CCCL_FORCEINLINE bool discard_temp_storage(typename AgentT::TempStorage& temp_storage) + { + // Ensure all threads finished using temporary storage + CTA_SYNC(); + + const std::size_t linear_tid = threadIdx.x; + const std::size_t block_stride = line_size * blockDim.x; + + char* ptr = reinterpret_cast(&temp_storage); + auto ptr_end = ptr + vsmem_per_block; + + // 128 byte-aligned virtual shared memory discard + for (auto thread_ptr = ptr + (linear_tid * line_size); thread_ptr < ptr_end; thread_ptr += block_stride) + { + cuda::discard_memory(thread_ptr, line_size); + } + return true; + } +}; + +template +constexpr bool use_fallback_agent() +{ + return (sizeof(typename DefaultAgentT::TempStorage) > max_smem_per_block) + && (sizeof(typename FallbackAgentT::TempStorage) <= max_smem_per_block); +} + +/** + * @brief Class template that helps to prevent exceeding the available shared memory per thread block with two measures: + * (1) If an agent's `TempStorage` declaration exceeds the maximum amount of shared memory per thread block, we check + * whether using a fallback policy, e.g., with a smaller tile size, would fit into shared memory. + * (2) If the fallback still doesn't fit into shared memory, we make use of virtual shared memory that is backed by + * global memory. + * + * @tparam DefaultAgentPolicyT The default tuning policy that is used if the default agent's shared memory requirements + * fall within the bounds of `max_smem_per_block` or when virtual shared memory is needed + * @tparam DefaultAgentT The default agent, instantiated with the given default tuning policy + * @tparam FallbackAgentPolicyT A fallback tuning policy that may exhibit lower shared memory requirements, e.g., by + * using a smaller tile size, than the default. This fallback policy is used if and only if the shared memory + * requirements of the default agent exceed `max_smem_per_block`, yet the shared memory requirements of the fallback + * agent falls within the bounds of `max_smem_per_block`. + * @tparam FallbackAgentT The fallback agent, instantiated with the given fallback tuning policy + */ +template ()> +struct vsmem_helper_with_fallback_impl : public vsmem_helper_impl +{ + using agent_t = DefaultAgentT; + using agent_policy_t = DefaultAgentPolicyT; +}; +template +struct vsmem_helper_with_fallback_impl + : public vsmem_helper_impl +{ + using agent_t = FallbackAgentT; + using agent_policy_t = FallbackAgentPolicyT; +}; + +/** + * @brief Alias template for the `vsmem_helper_with_fallback_impl` that instantiates the given AgentT template with the + * respective policy as first template parameter, followed by the parameters captured by the `AgentParamsT` template + * parameter pack. + */ +template class AgentT, typename... 
AgentParamsT> +using vsmem_helper_fallback_policy_t = + vsmem_helper_with_fallback_impl, + FallbackPolicyT, + AgentT>; + +/** + * @brief Alias template for the `vsmem_helper_t` by using a simple fallback policy that uses `DefaultPolicyT` as basis, + * overwriting `64` threads per block and `1` item per thread. + */ +template class AgentT, typename... AgentParamsT> +using vsmem_helper_default_fallback_policy_t = + vsmem_helper_fallback_policy_t, AgentT, AgentParamsT...>; + +} // namespace detail + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +CUB_NAMESPACE_END diff --git a/cub/test/catch2_test_vsmem.cu b/cub/test/catch2_test_vsmem.cu index 1eab3d31ad2..b8d3f7e37be 100644 --- a/cub/test/catch2_test_vsmem.cu +++ b/cub/test/catch2_test_vsmem.cu @@ -31,6 +31,7 @@ #include #include #include +#include #include "catch2/catch.hpp" #include "catch2_test_helper.h" diff --git a/libcudacxx/include/cuda/discard_memory b/libcudacxx/include/cuda/discard_memory index 5893bf6108e..cc4963874ae 100644 --- a/libcudacxx/include/cuda/discard_memory +++ b/libcudacxx/include/cuda/discard_memory @@ -22,7 +22,6 @@ #endif // no system header #include -#include _LIBCUDACXX_BEGIN_NAMESPACE_CUDA From 8344490af89ebcaf7cfee66665c347ff47337670 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 30 Apr 2024 19:09:10 +0200 Subject: [PATCH 2/9] Add missing includes of `cuda::std::min` and `cuda::std::max` to functional until we get `` --- libcudacxx/include/cuda/functional | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libcudacxx/include/cuda/functional b/libcudacxx/include/cuda/functional index 955631e23a5..7820c8352cc 100644 --- a/libcudacxx/include/cuda/functional +++ b/libcudacxx/include/cuda/functional @@ -68,6 +68,8 @@ #include #include #include +#include +#include _LIBCUDACXX_BEGIN_NAMESPACE_CUDA namespace __detail From 24568f64de458a35073c0c899e424a3a8ad5badf Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 30 Apr 2024 19:09:40 +0200 Subject: [PATCH 3/9] Add missing include of `` to `cuda_pinned_memory_resource` --- .../include/cuda/__memory_resource/cuda_pinned_memory_resource.h | 1 + 1 file changed, 1 insertion(+) diff --git a/libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h index b89f6cdcb17..1597cd2bcf5 100644 --- a/libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/cuda_pinned_memory_resource.h @@ -25,6 +25,7 @@ # if !defined(_CCCL_CUDA_COMPILER_NVCC) && !defined(_CCCL_CUDA_COMPILER_NVHPC) # include +# include # endif // !_CCCL_CUDA_COMPILER_NVCC && !_CCCL_CUDA_COMPILER_NVHPC # include From ab4cf5090f6ad8e6d8cd671b1b2554b6cb0213ec Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 30 Apr 2024 19:09:59 +0200 Subject: [PATCH 4/9] Mark `_CCCL_FORCEINLINE` as inline on host --- libcudacxx/include/cuda/std/__cccl/execution_space.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libcudacxx/include/cuda/std/__cccl/execution_space.h b/libcudacxx/include/cuda/std/__cccl/execution_space.h index d72b08462dd..499f480717a 100644 --- a/libcudacxx/include/cuda/std/__cccl/execution_space.h +++ b/libcudacxx/include/cuda/std/__cccl/execution_space.h @@ -27,12 +27,12 @@ # define _CCCL_DEVICE __device__ # define _CCCL_HOST_DEVICE __host__ __device__ # define _CCCL_FORCEINLINE __forceinline__ -#else // ^^^ __CUDACC__ || _NVHPC_CUDA ^^^ / vvv !__CUDACC__ && !_NVHPC_CUDA +#else // ^^^ 
_CCCL_CUDA_COMPILER ^^^ / vvv !_CCCL_CUDA_COMPILER # define _CCCL_HOST # define _CCCL_DEVICE # define _CCCL_HOST_DEVICE -# define _CCCL_FORCEINLINE -#endif // !__CUDACC__ && !_NVHPC_CUDA +# define _CCCL_FORCEINLINE inline +#endif // !_CCCL_CUDA_COMPILER #if !defined(_CCCL_EXEC_CHECK_DISABLE) # if defined(_CCCL_CUDA_COMPILER_NVCC) From 463aecd43ce3296656728b2d3a769ee9882b21c0 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 30 Apr 2024 19:10:23 +0200 Subject: [PATCH 5/9] Avoid copying output iterators in `thrust::copy_if` --- thrust/thrust/system/cuda/detail/copy_if.h | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/thrust/thrust/system/cuda/detail/copy_if.h b/thrust/thrust/system/cuda/detail/copy_if.h index b36285b2f2a..7706a6ac2a4 100644 --- a/thrust/thrust/system/cuda/detail/copy_if.h +++ b/thrust/thrust/system/cuda/detail/copy_if.h @@ -45,6 +45,7 @@ # include # include +# include # include # include # include @@ -95,10 +96,9 @@ struct DispatchCopyIf size_t& temp_storage_bytes, InputIt first, StencilIt stencil, - OutputIt output, + OutputIt& output, Predicate predicate, - OffsetT num_items, - OutputIt& output_end) + OffsetT num_items) { using num_selected_out_it_t = OffsetT*; using equality_op_t = cub::NullType; @@ -147,7 +147,6 @@ struct DispatchCopyIf // Return for empty problems if (num_items == 0) { - output_end = output; return status; } @@ -180,8 +179,7 @@ struct DispatchCopyIf status = cuda_cub::synchronize(policy); CUDA_CUB_RET_IF_FAIL(status); OffsetT num_selected = get_value(policy, d_num_selected_out); - - output_end = output + num_selected; + thrust::advance(output, num_selected); return status; } }; @@ -197,8 +195,7 @@ THRUST_RUNTIME_FUNCTION OutputIt copy_if( { using size_type = typename iterator_traits::difference_type; - size_type num_items = static_cast(thrust::distance(first, last)); - OutputIt output_end{}; + size_type num_items = static_cast(thrust::distance(first, last)); cudaError_t status = cudaSuccess; size_t temp_storage_bytes = 0; @@ -214,7 +211,7 @@ THRUST_RUNTIME_FUNCTION OutputIt copy_if( dispatch32_t::dispatch, dispatch64_t::dispatch, num_items, - (policy, nullptr, temp_storage_bytes, first, stencil, output, predicate, num_items_fixed, output_end)); + (policy, nullptr, temp_storage_bytes, first, stencil, output, predicate, num_items_fixed)); cuda_cub::throw_on_error(status, "copy_if failed on 1st step"); // Allocate temporary storage. 
@@ -227,10 +224,10 @@ THRUST_RUNTIME_FUNCTION OutputIt copy_if( dispatch32_t::dispatch, dispatch64_t::dispatch, num_items, - (policy, temp_storage, temp_storage_bytes, first, stencil, output, predicate, num_items_fixed, output_end)); + (policy, temp_storage, temp_storage_bytes, first, stencil, output, predicate, num_items_fixed)); cuda_cub::throw_on_error(status, "copy_if failed on 2nd step"); - return output_end; + return output; } } // namespace detail From 162a1a76364904eb971bf26adbb229b1aa067f44 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 30 Apr 2024 19:10:51 +0200 Subject: [PATCH 6/9] Try to ensure that `thrust::tuple` and `thrust::pair` work with CTAD --- libcudacxx/include/cuda/std/__utility/pair.h | 47 ++++---- .../cuda/std/detail/libcxx/include/tuple | 13 --- thrust/testing/pair.cu | 14 ++- thrust/testing/tuple.cu | 16 ++- thrust/thrust/detail/functional/actor.h | 2 +- thrust/thrust/detail/functional/actor.inl | 5 - .../detail/tuple_of_iterator_references.h | 14 +-- thrust/thrust/optional.h | 7 +- thrust/thrust/pair.h | 86 +++++++++++++- thrust/thrust/tuple.h | 107 ++++++++++++++++-- 10 files changed, 242 insertions(+), 69 deletions(-) diff --git a/libcudacxx/include/cuda/std/__utility/pair.h b/libcudacxx/include/cuda/std/__utility/pair.h index 9269a09c745..8b17881446a 100644 --- a/libcudacxx/include/cuda/std/__utility/pair.h +++ b/libcudacxx/include/cuda/std/__utility/pair.h @@ -283,18 +283,21 @@ struct _LIBCUDACXX_TEMPLATE_VIS pair : public __pair_base<_T1, _T2> : __base(__t1, __t2) {} - template ::template __constructible<_U1, _U2>, + template + using __pair_constructible = typename __pair_constraints<_T1, _T2>::template __constructible<_U1, _U2>; + + template , __enable_if_t<_Constraints::__explicit_constructible, int> = 0> _LIBCUDACXX_INLINE_VISIBILITY explicit constexpr pair(_U1&& __u1, _U2&& __u2) noexcept( _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T1, _U1) && _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T2, _U2)) : __base(_CUDA_VSTD::forward<_U1>(__u1), _CUDA_VSTD::forward<_U2>(__u2)) {} - template ::template __constructible<_U1, _U2>, + template , __enable_if_t<_Constraints::__implicit_constructible, int> = 0> _LIBCUDACXX_INLINE_VISIBILITY constexpr pair(_U1&& __u1, _U2&& __u2) noexcept( _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T1, _U1) && _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T2, _U2)) @@ -316,9 +319,9 @@ struct _LIBCUDACXX_TEMPLATE_VIS pair : public __pair_base<_T1, _T2> pair(pair const&) = default; pair(pair&&) = default; - template ::template __constructible, + template , __enable_if_t<_Constraints::__explicit_constructible, int> = 0> _LIBCUDACXX_INLINE_VISIBILITY explicit _CCCL_CONSTEXPR_CXX14 pair(const pair<_U1, _U2>& __p) noexcept( _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T1, const _U1&) @@ -326,9 +329,9 @@ struct _LIBCUDACXX_TEMPLATE_VIS pair : public __pair_base<_T1, _T2> : __base(__p.first, __p.second) {} - template ::template __constructible, + template , __enable_if_t<_Constraints::__implicit_constructible, int> = 0> _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 pair(const pair<_U1, _U2>& __p) noexcept( _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T1, const _U1&) @@ -337,18 +340,18 @@ struct _LIBCUDACXX_TEMPLATE_VIS pair : public __pair_base<_T1, _T2> {} // move constructors - template ::template __constructible<_U1, _U2>, + template , __enable_if_t<_Constraints::__explicit_constructible, int> = 0> _LIBCUDACXX_INLINE_VISIBILITY explicit _CCCL_CONSTEXPR_CXX14 pair(pair<_U1, _U2>&& __p) noexcept( 
_LIBCUDACXX_TRAIT(is_nothrow_constructible, _T1, _U1) && _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T2, _U2)) : __base(_CUDA_VSTD::forward<_U1>(__p.first), _CUDA_VSTD::forward<_U2>(__p.second)) {} - template ::template __constructible<_U1, _U2>, + template , __enable_if_t<_Constraints::__implicit_constructible, int> = 0> _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 pair(pair<_U1, _U2>&& __p) noexcept( _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T1, _U1) && _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T2, _U2)) @@ -359,7 +362,7 @@ struct _LIBCUDACXX_TEMPLATE_VIS pair : public __pair_base<_T1, _T2> #if defined(__cuda_std__) && !defined(_CCCL_COMPILER_NVRTC) template ::template __constructible, + class _Constraints = __pair_constructible, __enable_if_t<_Constraints::__explicit_constructible, int> = 0> _CCCL_HOST _LIBCUDACXX_HIDE_FROM_ABI explicit _CCCL_CONSTEXPR_CXX14 pair(const ::std::pair<_U1, _U2>& __p) noexcept( _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T1, const _U1&) @@ -369,7 +372,7 @@ struct _LIBCUDACXX_TEMPLATE_VIS pair : public __pair_base<_T1, _T2> template ::template __constructible, + class _Constraints = __pair_constructible, __enable_if_t<_Constraints::__implicit_constructible, int> = 0> _CCCL_HOST _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 pair(const ::std::pair<_U1, _U2>& __p) noexcept( _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T1, const _U1&) @@ -379,7 +382,7 @@ struct _LIBCUDACXX_TEMPLATE_VIS pair : public __pair_base<_T1, _T2> template ::template __constructible<_U1, _U2>, + class _Constraints = __pair_constructible<_U1, _U2>, __enable_if_t<_Constraints::__explicit_constructible, int> = 0> _CCCL_HOST _LIBCUDACXX_HIDE_FROM_ABI explicit _CCCL_CONSTEXPR_CXX14 pair(::std::pair<_U1, _U2>&& __p) noexcept( _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T1, _U1) && _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T2, _U2)) @@ -388,7 +391,7 @@ struct _LIBCUDACXX_TEMPLATE_VIS pair : public __pair_base<_T1, _T2> template ::template __constructible<_U1, _U2>, + class _Constraints = __pair_constructible<_U1, _U2>, __enable_if_t<_Constraints::__implicit_constructible, int> = 0> _CCCL_HOST _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 pair(::std::pair<_U1, _U2>&& __p) noexcept( _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T1, _U1) && _LIBCUDACXX_TRAIT(is_nothrow_constructible, _T2, _U2)) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/tuple b/libcudacxx/include/cuda/std/detail/libcxx/include/tuple index 22cb1d51fe8..f1dadac04b7 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/tuple +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/tuple @@ -194,10 +194,6 @@ template _LIBCUDACXX_BEGIN_NAMESPACE_STD -template -struct __is_tuple_of_iterator_references : false_type -{}; - // __tuple_leaf struct __tuple_leaf_default_constructor_tag {}; @@ -808,15 +804,6 @@ public: typename __tuple_constraints<_Tp...>::template __tuple_like_constraints<_Tuple>, __invalid_tuple_constraints>; - // Horrible hack to make tuple_of_iterator_references work - template ::value, int> = 0, - __enable_if_t<(tuple_size<_TupleOfIteratorReferences>::value == sizeof...(_Tp)), int> = 0> - _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 tuple(_TupleOfIteratorReferences&& __t) - : tuple(_CUDA_VSTD::forward<_TupleOfIteratorReferences>(__t).template __to_tuple<_Tp...>( - __make_tuple_indices_t())) - {} - template , __enable_if_t::value, int> = 0, diff --git a/thrust/testing/pair.cu b/thrust/testing/pair.cu index 1f1ddcf4d2e..cab025444bd 100644 --- a/thrust/testing/pair.cu 
+++ b/thrust/testing/pair.cu @@ -322,4 +322,16 @@ void TestPairStructuredBindings(void) ASSERT_EQUAL(b, b2); } DECLARE_UNITTEST(TestPairStructuredBindings); -#endif + +void TestPairCTAD(void) +{ + const int a = 42; + const int b = 1337; + thrust::pair p(a, b); + + auto [a2, b2] = p; + ASSERT_EQUAL(a, a2); + ASSERT_EQUAL(b, b2); +} +DECLARE_UNITTEST(TestPairCTAD); +#endif // _CCCL_STD_VER >= 2017 diff --git a/thrust/testing/tuple.cu b/thrust/testing/tuple.cu index c3cb1b23bab..879d920a011 100644 --- a/thrust/testing/tuple.cu +++ b/thrust/testing/tuple.cu @@ -511,7 +511,21 @@ void TestTupleStructuredBindings(void) ASSERT_EQUAL(c, c2); } DECLARE_UNITTEST(TestTupleStructuredBindings); -#endif + +void TestTupleCTAD(void) +{ + const int a = 0; + const char b = 42; + const short c = 1337; + thrust::tuple t(a, b, c); + + auto [a2, b2, c2] = t; + ASSERT_EQUAL(a, a2); + ASSERT_EQUAL(b, b2); + ASSERT_EQUAL(c, c2); +} +DECLARE_UNITTEST(TestTupleCTAD); +#endif // _CCCL_STD_VER >= 2017 // Ensure that we are backwards compatible with the old thrust::tuple implementation static_assert( diff --git a/thrust/thrust/detail/functional/actor.h b/thrust/thrust/detail/functional/actor.h index 0dd0560e69e..3f30d0a1570 100644 --- a/thrust/thrust/detail/functional/actor.h +++ b/thrust/thrust/detail/functional/actor.h @@ -65,7 +65,7 @@ struct actor : Eval { typedef Eval eval_type; - _CCCL_HOST_DEVICE constexpr actor(); + constexpr actor() = default; _CCCL_HOST_DEVICE actor(const Eval& base); diff --git a/thrust/thrust/detail/functional/actor.inl b/thrust/thrust/detail/functional/actor.inl index 81146a594c6..64d367ed15f 100644 --- a/thrust/thrust/detail/functional/actor.inl +++ b/thrust/thrust/detail/functional/actor.inl @@ -48,11 +48,6 @@ namespace detail namespace functional { -template -_CCCL_HOST_DEVICE constexpr actor::actor() - : eval_type() -{} - template _CCCL_HOST_DEVICE actor::actor(const Eval& base) : eval_type(base) diff --git a/thrust/thrust/iterator/detail/tuple_of_iterator_references.h b/thrust/thrust/iterator/detail/tuple_of_iterator_references.h index 574715e4a77..bb43c9d1c4b 100644 --- a/thrust/thrust/iterator/detail/tuple_of_iterator_references.h +++ b/thrust/thrust/iterator/detail/tuple_of_iterator_references.h @@ -67,9 +67,7 @@ class tuple_of_iterator_references : public thrust::tuple using super_t = thrust::tuple; using super_t::super_t; - inline _CCCL_HOST_DEVICE tuple_of_iterator_references() - : super_t() - {} + tuple_of_iterator_references() = default; // allow implicit construction from tuple inline _CCCL_HOST_DEVICE tuple_of_iterator_references(const super_t& other) @@ -138,15 +136,15 @@ class tuple_of_iterator_references : public thrust::tuple } // namespace detail -THRUST_NAMESPACE_END - -_LIBCUDACXX_BEGIN_NAMESPACE_STD - template struct __is_tuple_of_iterator_references> - : integral_constant + : _CUDA_VSTD::true_type {}; +THRUST_NAMESPACE_END + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + // define tuple_size, tuple_element, etc. 
template struct tuple_size> diff --git a/thrust/thrust/optional.h b/thrust/thrust/optional.h index be186a9d78a..f7822324907 100644 --- a/thrust/thrust/optional.h +++ b/thrust/thrust/optional.h @@ -838,7 +838,12 @@ struct nullopt_t /// void foo (thrust::optional); /// foo(thrust::nullopt); //pass an empty optional /// ``` -static constexpr nullopt_t nullopt{nullopt_t::do_not_use{}, nullopt_t::do_not_use{}}; +#ifdef __CUDA_ARCH__ +__device__ static _LIBCUDACXX_CONSTEXPR_GLOBAL +#else +static constexpr +#endif // __CUDA_ARCH__ + nullopt_t nullopt{nullopt_t::do_not_use{}, nullopt_t::do_not_use{}}; class bad_optional_access : public std::exception { diff --git a/thrust/thrust/pair.h b/thrust/thrust/pair.h index 4fd2e8f8333..53d5cc93edf 100644 --- a/thrust/thrust/pair.h +++ b/thrust/thrust/pair.h @@ -50,7 +50,7 @@ THRUST_NAMESPACE_BEGIN * \tparam T A \c pair type of interest. */ template -using tuple_element = ::cuda::std::tuple_element; +using tuple_element = _CUDA_VSTD::tuple_element; /*! This convenience metafunction is included for compatibility with * \p tuple. It returns \c 2, the number of elements of a \p pair, @@ -59,7 +59,7 @@ using tuple_element = ::cuda::std::tuple_element; * \tparam Pair A \c pair type of interest. */ template -using tuple_size = ::cuda::std::tuple_size; +using tuple_size = _CUDA_VSTD::tuple_size; /*! \p pair is a generic data structure encapsulating a heterogeneous * pair of values. @@ -73,10 +73,49 @@ using tuple_size = ::cuda::std::tuple_size; * provided by pair::second_type. */ template -using pair = ::cuda::std::pair; - -using ::cuda::std::get; -using ::cuda::std::make_pair; +struct pair : public _CUDA_VSTD::pair +{ + using super_t = _CUDA_VSTD::pair; + using super_t::super_t; + +#if (defined(_CCCL_COMPILER_GCC) && __GNUC__ < 9) || (defined(_CCCL_COMPILER_CLANG) && __clang_major__ < 12) + // For whatever reason nvcc complains about that constructor being used before being defined in a constexpr variable + constexpr pair() = default; + + template ::template __constructible<_U1, _U2>, + _CUDA_VSTD::__enable_if_t<_Constraints::__implicit_constructible, int> = 0> + _CCCL_HOST_DEVICE constexpr pair(_U1&& __u1, _U2&& __u2) + : super_t(_CUDA_VSTD::forward<_U1>(__u1), _CUDA_VSTD::forward<_U2>(__u2)) + {} +#endif // _CCCL_COMPILER_GCC < 9 || _CCCL_COMPILER_CLANG < 12 +}; + +#if _CCCL_STD_VER >= 2017 +template +_CCCL_HOST_DEVICE pair(_T1, _T2) -> pair<_T1, _T2>; +#endif // _CCCL_STD_VER >= 2017 + +template +inline _CCCL_HOST_DEVICE + _CUDA_VSTD::__enable_if_t<_CUDA_VSTD::__is_swappable::value && _CUDA_VSTD::__is_swappable::value, void> + swap(pair& lhs, pair& rhs) noexcept( + (_CUDA_VSTD::__is_nothrow_swappable::value && _CUDA_VSTD::__is_nothrow_swappable::value)) +{ + lhs.swap(rhs); +} + +template +inline _CCCL_HOST_DEVICE + pair::type, typename _CUDA_VSTD::__unwrap_ref_decay::type> + make_pair(T1&& t1, T2&& t2) +{ + return pair::type, typename _CUDA_VSTD::__unwrap_ref_decay::type>( + _CUDA_VSTD::forward(t1), _CUDA_VSTD::forward(t2)); +} + +using _CUDA_VSTD::get; /*! 
\endcond */ @@ -88,3 +127,38 @@ using ::cuda::std::make_pair; */ THRUST_NAMESPACE_END + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +struct tuple_size> : tuple_size> +{}; + +template +struct tuple_element> : tuple_element> +{}; + +template +struct __tuple_like_ext> : true_type +{}; + +_LIBCUDACXX_END_NAMESPACE_STD + +// This is a workaround for the fact that structured bindings require that the specializations of +// `tuple_size` and `tuple_element` reside in namespace std (https://eel.is/c++draft/dcl.struct.bind#4). +// See https://github.com/NVIDIA/libcudacxx/issues/316 for a short discussion +#if _CCCL_STD_VER >= 2017 + +# include + +namespace std +{ +template +struct tuple_size> : tuple_size> +{}; + +template +struct tuple_element> : tuple_element> +{}; +} // namespace std +#endif // _CCCL_STD_VER >= 2017 diff --git a/thrust/thrust/tuple.h b/thrust/thrust/tuple.h index 5fa1d7f2f7c..4313df83fbd 100644 --- a/thrust/thrust/tuple.h +++ b/thrust/thrust/tuple.h @@ -94,28 +94,32 @@ _CCCL_HOST_DEVICE inline bool operator>(const null_type&, const null_type&) * \tparam N This parameter selects the element of interest. * \tparam T A \c tuple type of interest. * - * \see pair + * \see tuple * \see tuple */ template -using tuple_element = ::cuda::std::tuple_element; +using tuple_element = _CUDA_VSTD::tuple_element; /*! This metafunction returns the number of elements * of a \p tuple type of interest. * * \tparam T A \c tuple type of interest. * - * \see pair + * \see tuple * \see tuple */ template -using tuple_size = ::cuda::std::tuple_size; +using tuple_size = _CUDA_VSTD::tuple_size; + +template +struct __is_tuple_of_iterator_references : _CUDA_VSTD::false_type +{}; /*! \brief \p tuple is a class template that can be instantiated with up to ten * arguments. Each template argument specifies the type of element in the \p * tuple. Consequently, tuples are heterogeneous, fixed-size collections of * values. An instantiation of \p tuple with two arguments is similar to an - * instantiation of \p pair with the same two arguments. Individual elements + * instantiation of \p tuple with the same two arguments. Individual elements * of a \p tuple may be accessed with the \p get function. * * \tparam TN The type of the N \c tuple element. Thrust's \p tuple @@ -143,19 +147,72 @@ using tuple_size = ::cuda::std::tuple_size; * } * \endcode * - * \see pair + * \see tuple * \see get * \see make_tuple * \see tuple_element * \see tuple_size * \see tie */ -template -using tuple = ::cuda::std::tuple; +template +struct tuple : public _CUDA_VSTD::tuple +{ + using super_t = _CUDA_VSTD::tuple; + using super_t::super_t; + + tuple() = default; + + template ::value, int> = 0, + _CUDA_VSTD::__enable_if_t<(tuple_size<_TupleOfIteratorReferences>::value == sizeof...(Ts)), int> = 0> + _CCCL_HOST_DEVICE tuple(_TupleOfIteratorReferences&& tup) + : tuple(_CUDA_VSTD::forward<_TupleOfIteratorReferences>(tup).template __to_tuple( + _CUDA_VSTD::__make_tuple_indices_t())) + {} + + _CCCL_EXEC_CHECK_DISABLE + template ::value, int> = 0> + _CCCL_HOST_DEVICE tuple& operator=(TupleLike&& other) + { + super_t::operator=(_CUDA_VSTD::forward(other)); + return *this; + } +}; + +#if _CCCL_STD_VER >= 2017 +template +_CCCL_HOST_DEVICE tuple(Ts...) 
-> tuple; + +template +struct pair; + +template +_CCCL_HOST_DEVICE tuple(pair) -> tuple; +#endif // _CCCL_STD_VER >= 2017 + +template +inline _CCCL_HOST_DEVICE + _CUDA_VSTD::__enable_if_t<_CUDA_VSTD::__all<_CUDA_VSTD::__is_swappable::value...>::value, void> + swap(tuple& __x, + tuple& __y) noexcept((_CUDA_VSTD::__all<_CUDA_VSTD::__is_nothrow_swappable::value...>::value)) +{ + __x.swap(__y); +} + +template +inline _CCCL_HOST_DEVICE tuple::type...> make_tuple(Ts&&... __t) +{ + return tuple::type...>(_CUDA_VSTD::forward(__t)...); +} -using ::cuda::std::get; -using ::cuda::std::make_tuple; -using ::cuda::std::tie; +template +inline _CCCL_HOST_DEVICE tuple tie(Ts&... ts) noexcept +{ + return tuple(ts...); +} + +using _CUDA_VSTD::get; /*! \endcond */ @@ -170,6 +227,18 @@ THRUST_NAMESPACE_END _LIBCUDACXX_BEGIN_NAMESPACE_STD +template +struct tuple_size> : tuple_size> +{}; + +template +struct tuple_element> : tuple_element> +{}; + +template +struct __tuple_like_ext> : true_type +{}; + template <> struct tuple_size= 2017 +namespace std +{ +template +struct tuple_size> : tuple_size> +{}; + +template +struct tuple_element> : tuple_element> +{}; +} // namespace std +#endif // _CCCL_STD_VER >= 2017 From fe3a98bf844600cb078343ab3b010379f6f2ca0a Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 2 May 2024 19:03:00 +0200 Subject: [PATCH 7/9] Add workaround for MSVC2017 --- thrust/thrust/tuple.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/thrust/thrust/tuple.h b/thrust/thrust/tuple.h index 4313df83fbd..f8a77850174 100644 --- a/thrust/thrust/tuple.h +++ b/thrust/thrust/tuple.h @@ -40,6 +40,7 @@ #endif // no system header #include +#include #include #include @@ -178,6 +179,23 @@ struct tuple : public _CUDA_VSTD::tuple super_t::operator=(_CUDA_VSTD::forward(other)); return *this; } + +#if defined(_CCCL_COMPILER_MSVC_2017) + // MSVC2017 needs some help to convert tuples + template , tuple>::value, int> = 0, + _CUDA_VSTD::__enable_if_t<_CUDA_VSTD::__tuple_convertible<_CUDA_VSTD::tuple, super_t>::value, int> = 0> + _CCCL_HOST_DEVICE constexpr operator tuple() + { + return __to_tuple(typename _CUDA_VSTD::__make_tuple_indices::type{}); + } + + template + _CCCL_HOST_DEVICE constexpr tuple __to_tuple(_CUDA_VSTD::__tuple_indices) const + { + return tuple{_CUDA_VSTD::get(*this)...}; + } +#endif // _CCCL_COMPILER_MSVC_2017 }; #if _CCCL_STD_VER >= 2017 From 2eea114e37763af5718129de9d6961e8f70ab269 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 3 May 2024 16:39:30 +0200 Subject: [PATCH 8/9] Fix docs --- thrust/thrust/tuple.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/thrust/thrust/tuple.h b/thrust/thrust/tuple.h index f8a77850174..9a69a3b1c2b 100644 --- a/thrust/thrust/tuple.h +++ b/thrust/thrust/tuple.h @@ -95,7 +95,7 @@ _CCCL_HOST_DEVICE inline bool operator>(const null_type&, const null_type&) * \tparam N This parameter selects the element of interest. * \tparam T A \c tuple type of interest. * - * \see tuple + * \see pair * \see tuple */ template @@ -120,7 +120,7 @@ struct __is_tuple_of_iterator_references : _CUDA_VSTD::false_type * arguments. Each template argument specifies the type of element in the \p * tuple. Consequently, tuples are heterogeneous, fixed-size collections of * values. An instantiation of \p tuple with two arguments is similar to an - * instantiation of \p tuple with the same two arguments. Individual elements + * instantiation of \p pair with the same two arguments. 
Individual elements
  * of a \p tuple may be accessed with the \p get function.
  *
  * \tparam TN The type of the N \c tuple element. Thrust's \p tuple
@@ -148,7 +148,7 @@ struct __is_tuple_of_iterator_references : _CUDA_VSTD::false_type
  * }
  * \endcode
  *
- * \see tuple
+ * \see pair
  * \see get
  * \see make_tuple
  * \see tuple_element

From a85236d934bf7fd4ea4041a0ddc81c5d053ac28c Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa
Date: Mon, 6 May 2024 08:44:13 +0200
Subject: [PATCH 9/9] Fix comment

---
 thrust/thrust/tuple.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thrust/thrust/tuple.h b/thrust/thrust/tuple.h
index 9a69a3b1c2b..701da426b3d 100644
--- a/thrust/thrust/tuple.h
+++ b/thrust/thrust/tuple.h
@@ -106,7 +106,7 @@ using tuple_element = _CUDA_VSTD::tuple_element;
  *
  * \tparam T A \c tuple type of interest.
  *
- * \see tuple
+ * \see pair
  * \see tuple
  */
 template
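
Usage sketch for the vsmem helper relocated by PATCH 1/9 into cub/cub/util_vsmem.cuh
(illustrative only, not part of the patch; `AgentT`, `example_kernel`, and the agent's
constructor arguments are hypothetical placeholders — only the `vsmem_t` /
`vsmem_helper_impl` API shown here comes from the moved code):

    #include <cub/util_type.cuh>   // cub::NullType
    #include <cub/util_vsmem.cuh>  // cub::detail::vsmem_t, cub::detail::vsmem_helper_impl

    template <typename AgentT>
    __global__ void example_kernel(cub::detail::vsmem_t vsmem)
    {
      using vsmem_helper_t = cub::detail::vsmem_helper_impl<AgentT>;

      // Either AgentT::TempStorage (fits into the 48 KB static shared memory budget)
      // or cub::NullType (temporary storage must live in global-memory-backed vsmem)
      __shared__ typename vsmem_helper_t::static_temp_storage_t static_temp_storage;

      // Returns the shared-memory storage, or this block's slice of the vsmem allocation
      auto& temp_storage = vsmem_helper_t::get_temp_storage(static_temp_storage, vsmem);

      AgentT agent{temp_storage /*, ...further constructor arguments... */};
      // ...agent does its work using temp_storage...

      // Hints to discard the modified cache lines when vsmem was used; no-op otherwise
      vsmem_helper_t::discard_temp_storage(temp_storage);
    }

    // Host side (sketch): the dispatch layer allocates
    // grid_size * vsmem_helper_t::vsmem_per_block bytes of global memory
    // (zero bytes if the agent fits into shared memory) and passes it as vsmem_t{ptr}.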
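
Usage sketch for the CTAD support added to `thrust::pair` and `thrust::tuple` by
PATCHES 6-9 (illustrative only, mirroring the new TestPairCTAD / TestTupleCTAD unit
tests; requires C++17):

    #include <thrust/pair.h>
    #include <thrust/tuple.h>

    int main()
    {
      // Class template argument deduction now works for thrust::pair and thrust::tuple
      thrust::pair  p(42, 1337);      // deduced as thrust::pair<int, int>
      thrust::tuple t(0, 'a', 3.14);  // deduced as thrust::tuple<int, char, double>

      // Structured bindings keep working on the derived types
      auto [a, b] = p;

      return (a == 42 && b == 1337 && thrust::get<1>(t) == 'a') ? 0 : 1;
    }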