From 9f721c33f5ec2cdb623b28007de384069fb23058 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 31 Jul 2024 12:16:26 -0700 Subject: [PATCH 1/9] make DeviceScan offset type a template parameter --- cub/cub/device/device_scan.cuh | 119 ++++++++++++++++++--------------- 1 file changed, 65 insertions(+), 54 deletions(-) diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index c8a36f0255e..6d0a4ce4b7d 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -41,6 +41,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -172,19 +173,19 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSum"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; using InitT = cub::detail::value_t; // Initial value @@ -194,13 +195,13 @@ struct DeviceScan d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), detail::InputValue(init_value), num_items, stream); } - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -276,19 +277,19 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( - void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, int num_items, cudaStream_t stream = 0) + void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0) { return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -399,7 +400,7 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, @@ -407,13 +408,13 @@ struct DeviceScan OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; return DispatchScan, OffsetT>::Dispatch( d_temp_storage, @@ -426,7 +427,7 @@ struct DeviceScan stream); } - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, @@ -434,7 +435,7 @@ struct DeviceScan OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -537,27 +538,27 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, InitValueT init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, InitValueT init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -677,7 +678,8 @@ struct DeviceScan typename OutputIteratorT, typename ScanOpT, typename InitValueT, - typename InitValueIterT = InitValueT*> + typename InitValueIterT = InitValueT*, + typename NumItemsT = int> CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, @@ -685,13 +687,13 @@ struct DeviceScan OutputIteratorT d_out, ScanOpT scan_op, FutureValue init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; return DispatchScan, OffsetT>::Dispatch( d_temp_storage, @@ -708,7 +710,8 @@ struct DeviceScan typename OutputIteratorT, typename ScanOpT, typename InitValueT, - typename InitValueIterT = InitValueT*> + typename InitValueIterT = InitValueT*, + typename NumItemsT = int> CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, @@ -716,7 +719,7 @@ struct DeviceScan OutputIteratorT d_out, ScanOpT scan_op, FutureValue init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -822,27 +825,35 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, FutureValue init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, FutureValue init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -931,31 +942,31 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSum"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; return DispatchScan::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), NullType(), num_items, stream); } - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -1030,19 +1041,19 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( - void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, int num_items, cudaStream_t stream = 0) + void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0) { return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -1145,20 +1156,20 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScan"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; return DispatchScan::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream); @@ -1230,7 +1241,7 @@ struct DeviceScan //! //! @param[in] stream //! CUDA stream to launch kernels within. - template + template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanInit( void* d_temp_storage, size_t& temp_storage_bytes, @@ -1238,13 +1249,13 @@ struct DeviceScan OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanInit"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; using AccumT = cub::detail::accumulator_t>; constexpr bool ForceInclusive = true; @@ -1266,14 +1277,14 @@ struct DeviceScan stream); } - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -1367,25 +1378,25 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream); } - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { From 5f1e0224e6a2b61111500d07b4e58ace571cebd4 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 31 Jul 2024 21:56:47 -0700 Subject: [PATCH 2/9] updates tests to use device interface --- .../catch2_test_device_scan_large_offsets.cu | 41 ++++++------------- 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/cub/test/catch2_test_device_scan_large_offsets.cu b/cub/test/catch2_test_device_scan_large_offsets.cu index 9d00d89e144..0c0854e21e1 100644 --- a/cub/test/catch2_test_device_scan_large_offsets.cu +++ b/cub/test/catch2_test_device_scan_large_offsets.cu @@ -35,33 +35,12 @@ #include "catch2_test_helper.h" #include "catch2_test_launch_helper.h" -// TODO(elstehle) replace with DeviceScan interface once https://github.com/NVIDIA/cccl/issues/50 is addressed -// Temporary wrapper that allows specializing the DeviceScan algorithm for different offset types -template -CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_scan_wrapper( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - InitValueT init_value, - OffsetT num_items, - cudaStream_t stream = 0) -{ - using init_value_t = cub::detail::InputValue; - init_value_t init_value_wrapper{init_value}; - - return cub::DispatchScan::Dispatch( - d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value_wrapper, num_items, stream); -} - -DECLARE_LAUNCH_WRAPPER(dispatch_scan_wrapper, dispatch_exclusive_scan); +DECLARE_LAUNCH_WRAPPER(cub::DeviceScan::ExclusiveScan, device_exclusive_scan); // %PARAM% TEST_LAUNCH lid 0:1:2 -// TODO(elstehle) replace with actual offset types, once https://github.com/NVIDIA/cccl/issues/50 is addresed // List of offset types to be used for testing large number of items -using offset_types = c2h::type_list; +using offset_types = c2h::type_list; template struct expected_sum_op @@ -106,12 +85,12 @@ try offset_t num_items_max = static_cast(num_items_max_ull); offset_t num_items_min = num_items_max_ull > 10000 ? static_cast(num_items_max_ull - 10000ULL) : offset_t{0}; - // TODO(elstehle) remove single-item size, once https://github.com/NVIDIA/cccl/issues/50 is addresed - offset_t num_items = - GENERATE_COPY(values({num_items_max, static_cast(num_items_max - 1), static_cast(1)}), - take(2, random(num_items_min, num_items_max))); + offset_t num_items = GENERATE_COPY( + values( + {num_items_max, static_cast(num_items_max - 1), static_cast(1), static_cast(3)}), + take(2, random(num_items_min, num_items_max))); - // Prepare input + // Prepare input (generate a series of: 0, 1, 2, ..., , 0, 1, 2, ..., , 0, 1, ...) constexpr index_t segment_size = 1000; auto index_it = thrust::make_counting_iterator(index_t{}); auto items_it = thrust::make_transform_iterator(index_it, mod_op{segment_size}); @@ -120,8 +99,12 @@ try c2h::device_vector d_items_out(num_items); auto d_items_out_it = thrust::raw_pointer_cast(d_items_out.data()); + c2h::device_vector d_initial_value(1); + d_initial_value[0] = item_t{}; + auto future_init_value = cub::FutureValue(thrust::raw_pointer_cast(d_initial_value.data())); + // Run test - dispatch_exclusive_scan(items_it, d_items_out_it, op_t{}, item_t{}, num_items); + device_exclusive_scan(items_it, d_items_out_it, op_t{}, future_init_value, num_items); // Ensure that we created the correct output auto expected_out_it = From 746b108c6fb936e7f42d5c97f44edbcd0fa37eff Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 31 Jul 2024 22:14:08 -0700 Subject: [PATCH 3/9] moves thrust scan to unsigned offset types --- thrust/thrust/system/cuda/detail/scan.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/thrust/thrust/system/cuda/detail/scan.h b/thrust/thrust/system/cuda/detail/scan.h index e225f2cfe4e..e2530691db7 100644 --- a/thrust/thrust/system/cuda/detail/scan.h +++ b/thrust/thrust/system/cuda/detail/scan.h @@ -63,8 +63,8 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( thrust::cuda_cub::execution_policy& policy, InputIt first, Size num_items, OutputIt result, ScanOp scan_op) { using AccumT = typename thrust::iterator_traits::value_type; - using Dispatch32 = cub::DispatchScan; - using Dispatch64 = cub::DispatchScan; + using Dispatch32 = cub::DispatchScan; + using Dispatch64 = cub::DispatchScan; cudaStream_t stream = thrust::cuda_cub::stream(policy); cudaError_t status; @@ -113,8 +113,8 @@ _CCCL_HOST_DEVICE OutputIt exclusive_scan_n_impl( ScanOp scan_op) { using InputValueT = cub::detail::InputValue; - using Dispatch32 = cub::DispatchScan; - using Dispatch64 = cub::DispatchScan; + using Dispatch32 = cub::DispatchScan; + using Dispatch64 = cub::DispatchScan; cudaStream_t stream = thrust::cuda_cub::stream(policy); cudaError_t status; From 994c62c9161c11e0edc6936abfe7d15c77c138ff Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 5 Aug 2024 07:12:23 -0700 Subject: [PATCH 4/9] adjusts benchmarks to account for used offset types --- cub/benchmarks/bench/scan/exclusive/base.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh index 65b760fba26..42897931679 100644 --- a/cub/benchmarks/bench/scan/exclusive/base.cuh +++ b/cub/benchmarks/bench/scan/exclusive/base.cuh @@ -129,7 +129,7 @@ static void basic(nvbench::state& state, nvbench::type_list) }); } -using some_offset_types = nvbench::type_list; +using some_offset_types = nvbench::type_list; NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types, some_offset_types)) .set_name("base") From ebea6bdf9786a3cbd2a7767d14b266553b3b26a7 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 19 Aug 2024 08:57:36 -0700 Subject: [PATCH 5/9] uses dynamic dispatch to unsigned type --- thrust/thrust/system/cuda/detail/dispatch.h | 14 ++++++++++++++ thrust/thrust/system/cuda/detail/scan.h | 8 ++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h index 90c99688f7c..f1f3090f8e5 100644 --- a/thrust/thrust/system/cuda/detail/dispatch.h +++ b/thrust/thrust/system/cuda/detail/dispatch.h @@ -90,6 +90,20 @@ status = call_64 arguments; \ } +/// Like \ref THRUST_INDEX_TYPE_DISPATCH2 but dispatching to uint32_t and uint64_t, respectively, depending on the +/// `count` argument. +#define THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count, arguments) \ + if (count <= thrust::detail::integer_traits::const_max) \ + { \ + auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ + status = call_32 arguments; \ + } \ + else \ + { \ + auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ + status = call_64 arguments; \ + } + /// Like \ref THRUST_INDEX_TYPE_DISPATCH2 but uses two counts. #define THRUST_DOUBLE_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count1, count2, arguments) \ if (count1 + count2 <= thrust::detail::integer_traits::const_max) \ diff --git a/thrust/thrust/system/cuda/detail/scan.h b/thrust/thrust/system/cuda/detail/scan.h index e2530691db7..418bbc2f2ee 100644 --- a/thrust/thrust/system/cuda/detail/scan.h +++ b/thrust/thrust/system/cuda/detail/scan.h @@ -72,7 +72,7 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( // Determine temporary storage requirements: size_t tmp_size = 0; { - THRUST_INDEX_TYPE_DISPATCH2( + THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2( status, Dispatch32::Dispatch, Dispatch64::Dispatch, @@ -88,7 +88,7 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( { // Allocate temporary storage: thrust::detail::temporary_array tmp{policy, tmp_size}; - THRUST_INDEX_TYPE_DISPATCH2( + THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2( status, Dispatch32::Dispatch, Dispatch64::Dispatch, @@ -122,7 +122,7 @@ _CCCL_HOST_DEVICE OutputIt exclusive_scan_n_impl( // Determine temporary storage requirements: size_t tmp_size = 0; { - THRUST_INDEX_TYPE_DISPATCH2( + THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2( status, Dispatch32::Dispatch, Dispatch64::Dispatch, @@ -138,7 +138,7 @@ _CCCL_HOST_DEVICE OutputIt exclusive_scan_n_impl( { // Allocate temporary storage: thrust::detail::temporary_array tmp{policy, tmp_size}; - THRUST_INDEX_TYPE_DISPATCH2( + THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2( status, Dispatch32::Dispatch, Dispatch64::Dispatch, From 7672dff3a751637a8d51fb05f1b2949f569d9712 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 19 Aug 2024 09:09:52 -0700 Subject: [PATCH 6/9] adds tparam docs for NumItemsT --- cub/cub/device/device_scan.cuh | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index dbb12bad47e..2786e6e7a6f 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -153,6 +153,9 @@ struct DeviceScan //! @tparam OutputIteratorT //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -262,6 +265,9 @@ struct DeviceScan //! @tparam IteratorT //! **[inferred]** Random-access iterator type for reading scan inputs and wrigin scan outputs //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -378,6 +384,9 @@ struct DeviceScan //! **[inferred]** Type of the `init_value` used Binary scan functor type //! having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -521,6 +530,9 @@ struct DeviceScan //! **[inferred]** Type of the `init_value` used Binary scan functor type //! having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -656,6 +668,9 @@ struct DeviceScan //! **[inferred]** Type of the `init_value` used Binary scan functor type //! having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -812,6 +827,9 @@ struct DeviceScan //! **[inferred]** Type of the `init_value` used Binary scan functor type //! having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -934,6 +952,9 @@ struct DeviceScan //! @tparam OutputIteratorT //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -1038,6 +1059,9 @@ struct DeviceScan //! @tparam IteratorT //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -1148,6 +1172,9 @@ struct DeviceScan //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] //! d_temp_storage Device-accessible allocation of temporary storage. //! When `nullptr`, the required allocation size is written to @@ -1231,6 +1258,9 @@ struct DeviceScan //! @tparam InitValueT //! **[inferred]** Type of the `init_value` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. //! When `nullptr`, the required allocation size is written to @@ -1375,6 +1405,9 @@ struct DeviceScan //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] //! d_temp_storage Device-accessible allocation of temporary storage. //! When `nullptr`, the required allocation size is written to From b96a9ada66a035e85123f2277ed7f21f2303e9cd Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 19 Aug 2024 21:16:50 -0700 Subject: [PATCH 7/9] fixes warning about different signedness comparison --- thrust/thrust/system/cuda/detail/dispatch.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h index f1f3090f8e5..02170b1c1be 100644 --- a/thrust/thrust/system/cuda/detail/dispatch.h +++ b/thrust/thrust/system/cuda/detail/dispatch.h @@ -92,16 +92,17 @@ /// Like \ref THRUST_INDEX_TYPE_DISPATCH2 but dispatching to uint32_t and uint64_t, respectively, depending on the /// `count` argument. -#define THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count, arguments) \ - if (count <= thrust::detail::integer_traits::const_max) \ - { \ - auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ - status = call_32 arguments; \ - } \ - else \ - { \ - auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ - status = call_64 arguments; \ +#define THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count, arguments) \ + if (static_cast(count) \ + <= static_cast(thrust::detail::integer_traits::const_max)) \ + { \ + auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ + status = call_32 arguments; \ + } \ + else \ + { \ + auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ + status = call_64 arguments; \ } /// Like \ref THRUST_INDEX_TYPE_DISPATCH2 but uses two counts. From a13e9b50ed8dad0dbecc9a55f46aea26496cba3d Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 19 Aug 2024 22:08:02 -0700 Subject: [PATCH 8/9] adds check for negative num_items in thrust::scan --- thrust/thrust/detail/integer_math.h | 14 ++++++++++++++ thrust/thrust/system/cuda/detail/dispatch.h | 2 +- thrust/thrust/system/cuda/detail/scan.h | 11 +++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/thrust/thrust/detail/integer_math.h b/thrust/thrust/detail/integer_math.h index ab37d9a3a9f..34248db4627 100644 --- a/thrust/thrust/detail/integer_math.h +++ b/thrust/thrust/detail/integer_math.h @@ -27,6 +27,8 @@ #endif // no system header #include +#include + #include #include @@ -60,6 +62,18 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool is_power_of_2(Integer x) return 0 == (x & (x - 1)); } +template +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE typename std::enable_if::value, bool>::type is_negative(T x) +{ + return x < 0; +} + +template +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE typename std::enable_if::value, bool>::type is_negative(T x) +{ + return false; +} + template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool is_odd(Integer x) { diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h index 02170b1c1be..971b93d6281 100644 --- a/thrust/thrust/system/cuda/detail/dispatch.h +++ b/thrust/thrust/system/cuda/detail/dispatch.h @@ -91,7 +91,7 @@ } /// Like \ref THRUST_INDEX_TYPE_DISPATCH2 but dispatching to uint32_t and uint64_t, respectively, depending on the -/// `count` argument. +/// `count` argument. `count` must not be negative. #define THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count, arguments) \ if (static_cast(count) \ <= static_cast(thrust::detail::integer_traits::const_max)) \ diff --git a/thrust/thrust/system/cuda/detail/scan.h b/thrust/thrust/system/cuda/detail/scan.h index 418bbc2f2ee..e9405776db7 100644 --- a/thrust/thrust/system/cuda/detail/scan.h +++ b/thrust/thrust/system/cuda/detail/scan.h @@ -42,6 +42,7 @@ # include +# include # include # include # include @@ -69,6 +70,11 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( cudaStream_t stream = thrust::cuda_cub::stream(policy); cudaError_t status; + // Negative number of items are normalized to `0` + if(thrust::detail::is_negative(num_items)){ + num_items = 0; + } + // Determine temporary storage requirements: size_t tmp_size = 0; { @@ -119,6 +125,11 @@ _CCCL_HOST_DEVICE OutputIt exclusive_scan_n_impl( cudaStream_t stream = thrust::cuda_cub::stream(policy); cudaError_t status; + // Negative number of items are normalized to `0` + if(thrust::detail::is_negative(num_items)){ + num_items = 0; + } + // Determine temporary storage requirements: size_t tmp_size = 0; { From 11c67ba38114f059a29060b6dc1db7ae20a0b05f Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 19 Aug 2024 23:57:29 -0700 Subject: [PATCH 9/9] fixes unused param in is_negative --- thrust/thrust/detail/integer_math.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thrust/thrust/detail/integer_math.h b/thrust/thrust/detail/integer_math.h index 34248db4627..730b0847674 100644 --- a/thrust/thrust/detail/integer_math.h +++ b/thrust/thrust/detail/integer_math.h @@ -69,7 +69,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE typename std::enable_if::v } template -_CCCL_HOST_DEVICE _CCCL_FORCEINLINE typename std::enable_if::value, bool>::type is_negative(T x) +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE typename std::enable_if::value, bool>::type is_negative(T) { return false; }