From a3937a6c80e9b7dcc6bc9bbae3289aace47a4593 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Sun, 4 Sep 2022 13:05:13 +0400 Subject: [PATCH] Wrap launch bounds --- cub/detail/detect_cuda_runtime.cuh | 9 +++++++++ cub/device/dispatch/dispatch_histogram.cuh | 2 +- cub/device/dispatch/dispatch_merge_sort.cuh | 6 ++++-- cub/device/dispatch/dispatch_radix_sort.cuh | 18 +++++++++--------- cub/device/dispatch/dispatch_reduce.cuh | 6 +++--- cub/device/dispatch/dispatch_reduce_by_key.cuh | 2 +- cub/device/dispatch/dispatch_rle.cuh | 2 +- cub/device/dispatch/dispatch_scan.cuh | 2 +- cub/device/dispatch/dispatch_scan_by_key.cuh | 2 +- .../dispatch/dispatch_segmented_sort.cuh | 8 ++++---- cub/device/dispatch/dispatch_select_if.cuh | 2 +- cub/device/dispatch/dispatch_spmv_orig.cuh | 4 ++-- .../dispatch/dispatch_three_way_partition.cuh | 2 +- cub/device/dispatch/dispatch_unique_by_key.cuh | 2 +- 14 files changed, 39 insertions(+), 28 deletions(-) diff --git a/cub/detail/detect_cuda_runtime.cuh b/cub/detail/detect_cuda_runtime.cuh index 7ac947277d..af185d21f0 100644 --- a/cub/detail/detect_cuda_runtime.cuh +++ b/cub/detail/detect_cuda_runtime.cuh @@ -94,6 +94,15 @@ namespace detail #endif // CUB_RUNTIME_FUNCTION predefined +#ifndef CUB_DETAIL_LAUNCH_BOUNDS +#ifdef CUB_RDC_ENABLED +#define CUB_DETAIL_LAUNCH_BOUNDS(...) +#else // not defined CUB_RDC_ENABLED +#define CUB_DETAIL_LAUNCH_BOUNDS(...) \ + __launch_bounds__(__VA_ARGS__) +#endif // CUB_RDC_ENABLED +#endif // CUB_DETAIL_LAUNCH_BOUNDS + #endif // Do not document } // namespace detail diff --git a/cub/device/dispatch/dispatch_histogram.cuh b/cub/device/dispatch/dispatch_histogram.cuh index 00e04efd83..66e529a5de 100644 --- a/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/device/dispatch/dispatch_histogram.cuh @@ -102,7 +102,7 @@ template < typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS)) +CUB_DETAIL_LAUNCH_BOUNDS(int(AgentHistogramPolicyT::BLOCK_THREADS)) __global__ void DeviceHistogramSweepKernel( SampleIteratorT d_samples, ///< Input data to reduce ArrayWrapper num_output_bins_wrapper, ///< The number bins per final output histogram diff --git a/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/device/dispatch/dispatch_merge_sort.cuh index 5bbd3dfdd7..beb461db9d 100644 --- a/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/device/dispatch/dispatch_merge_sort.cuh @@ -49,7 +49,8 @@ template -void __global__ __launch_bounds__(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS) +CUB_DETAIL_LAUNCH_BOUNDS(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS) +void __global__ DeviceMergeSortBlockSortKernel(bool ping, KeyInputIteratorT keys_in, ValueInputIteratorT items_in, @@ -136,7 +137,8 @@ template -void __global__ __launch_bounds__(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS) +CUB_DETAIL_LAUNCH_BOUNDS(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS) +void __global__ DeviceMergeSortMergeKernel(bool ping, KeyIteratorT keys_ping, ValueIteratorT items_ping, diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh index 1056e604e5..24500a447e 100644 --- a/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/device/dispatch/dispatch_radix_sort.cuh @@ -75,7 +75,7 @@ template < bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low typename KeyT, ///< Key type typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int((ALT_DIGIT_BITS) ? +CUB_DETAIL_LAUNCH_BOUNDS(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS) : int(ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))) __global__ void DeviceRadixSortUpsweepKernel( @@ -134,7 +134,7 @@ __global__ void DeviceRadixSortUpsweepKernel( template < typename ChainedPolicyT, ///< Chained tuning policy typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1) +CUB_DETAIL_LAUNCH_BOUNDS(int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1) __global__ void RadixSortScanBinsKernel( OffsetT *d_spine, ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) int num_counts) ///< [in] Total number of bin-counts @@ -184,7 +184,7 @@ template < typename KeyT, ///< Key type typename ValueT, ///< Value type typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int((ALT_DIGIT_BITS) ? +CUB_DETAIL_LAUNCH_BOUNDS(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS) : int(ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))) __global__ void DeviceRadixSortDownsweepKernel( @@ -247,7 +247,7 @@ template < typename KeyT, ///< Key type typename ValueT, ///< Value type typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) +CUB_DETAIL_LAUNCH_BOUNDS(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) __global__ void DeviceRadixSortSingleTileKernel( const KeyT *d_keys_in, ///< [in] Input keys buffer KeyT *d_keys_out, ///< [in] Output keys buffer @@ -363,7 +363,7 @@ template < typename BeginOffsetIteratorT, ///< Random-access input iterator type for reading segment beginning offsets \iterator typename EndOffsetIteratorT, ///< Random-access input iterator type for reading segment ending offsets \iterator typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int((ALT_DIGIT_BITS) ? +CUB_DETAIL_LAUNCH_BOUNDS(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS : ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS)) __global__ void DeviceSegmentedRadixSortKernel( @@ -536,8 +536,8 @@ template < bool IS_DESCENDING, typename KeyT, typename OffsetT> -__global__ void __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) -DeviceRadixSortHistogramKernel +CUB_DETAIL_LAUNCH_BOUNDS(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) +__global__ void DeviceRadixSortHistogramKernel (OffsetT* d_bins_out, const KeyT* d_keys_in, OffsetT num_items, int start_bit, int end_bit) { typedef typename ChainedPolicyT::ActivePolicy::HistogramPolicy HistogramPolicyT; @@ -555,8 +555,8 @@ template < typename OffsetT, typename PortionOffsetT, typename AtomicOffsetT = PortionOffsetT> -__global__ void __launch_bounds__(ChainedPolicyT::ActivePolicy::OnesweepPolicy::BLOCK_THREADS) -DeviceRadixSortOnesweepKernel +CUB_DETAIL_LAUNCH_BOUNDS(ChainedPolicyT::ActivePolicy::OnesweepPolicy::BLOCK_THREADS) +__global__ void DeviceRadixSortOnesweepKernel (AtomicOffsetT* d_lookback, AtomicOffsetT* d_ctrs, OffsetT* d_bins_out, const OffsetT* d_bins_in, KeyT* d_keys_out, const KeyT* d_keys_in, ValueT* d_values_out, const ValueT* d_values_in, PortionOffsetT num_items, int current_bit, int num_bits) diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh index ac434eb862..c9e1bd3c82 100644 --- a/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/device/dispatch/dispatch_reduce.cuh @@ -99,7 +99,7 @@ template -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) +CUB_DETAIL_LAUNCH_BOUNDS(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) __global__ void DeviceReduceKernel(InputIteratorT d_in, AccumT* d_out, OffsetT num_items, @@ -178,7 +178,7 @@ template -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) +CUB_DETAIL_LAUNCH_BOUNDS(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) __global__ void DeviceReduceSingleTileKernel(InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, @@ -304,7 +304,7 @@ template -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) +CUB_DETAIL_LAUNCH_BOUNDS(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) __global__ void DeviceSegmentedReduceKernel( InputIteratorT d_in, OutputIteratorT d_out, diff --git a/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/device/dispatch/dispatch_reduce_by_key.cuh index 738eef63da..75f63264c9 100644 --- a/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -130,7 +130,7 @@ template -__launch_bounds__(int(AgentReduceByKeyPolicyT::BLOCK_THREADS)) __global__ +CUB_DETAIL_LAUNCH_BOUNDS(int(AgentReduceByKeyPolicyT::BLOCK_THREADS)) __global__ void DeviceReduceByKeyKernel(KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, diff --git a/cub/device/dispatch/dispatch_rle.cuh b/cub/device/dispatch/dispatch_rle.cuh index b847a18657..b7ce11acbe 100644 --- a/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/device/dispatch/dispatch_rle.cuh @@ -71,7 +71,7 @@ template < typename ScanTileStateT, ///< Tile status interface type typename EqualityOpT, ///< T equality operator type typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int(AgentRlePolicyT::BLOCK_THREADS)) +CUB_DETAIL_LAUNCH_BOUNDS(int(AgentRlePolicyT::BLOCK_THREADS)) __global__ void DeviceRleSweepKernel( InputIteratorT d_in, ///< [in] Pointer to input sequence of data items OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh index 0df89b7c45..7195b43bd1 100644 --- a/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/device/dispatch/dispatch_scan.cuh @@ -163,7 +163,7 @@ template -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) +CUB_DETAIL_LAUNCH_BOUNDS(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) __global__ void DeviceScanKernel(InputIteratorT d_in, OutputIteratorT d_out, ScanTileStateT tile_state, diff --git a/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/device/dispatch/dispatch_scan_by_key.cuh index 1a4cfe7c4f..ee921fb064 100644 --- a/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -123,7 +123,7 @@ template > -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THREADS)) +CUB_DETAIL_LAUNCH_BOUNDS(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THREADS)) __global__ void DeviceScanByKeyKernel(KeysInputIteratorT d_keys_in, KeyT *d_keys_prev_in, ValuesInputIteratorT d_values_in, diff --git a/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/device/dispatch/dispatch_segmented_sort.cuh index d00e998f16..0d90f7d16a 100644 --- a/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -103,7 +103,7 @@ template -__launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREADS) +CUB_DETAIL_LAUNCH_BOUNDS(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREADS) __global__ void DeviceSegmentedSortFallbackKernel( const KeyT *d_keys_in_orig, KeyT *d_keys_out_orig, @@ -298,7 +298,7 @@ template -__launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolicyT::BLOCK_THREADS) +CUB_DETAIL_LAUNCH_BOUNDS(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolicyT::BLOCK_THREADS) __global__ void DeviceSegmentedSortKernelSmall( unsigned int small_segments, unsigned int medium_segments, @@ -427,7 +427,7 @@ template -__launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREADS) +CUB_DETAIL_LAUNCH_BOUNDS(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREADS) __global__ void DeviceSegmentedSortKernelLarge( const unsigned int *d_segments_indices, const KeyT *d_keys_in_orig, @@ -685,7 +685,7 @@ template -__launch_bounds__(1) __global__ void +CUB_DETAIL_LAUNCH_BOUNDS(1) __global__ void DeviceSegmentedSortContinuationKernel( LargeKernelT large_kernel, SmallKernelT small_kernel, diff --git a/cub/device/dispatch/dispatch_select_if.cuh b/cub/device/dispatch/dispatch_select_if.cuh index 8ffb130b37..31e3ce1348 100644 --- a/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/device/dispatch/dispatch_select_if.cuh @@ -73,7 +73,7 @@ template < typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) typename OffsetT, ///< Signed integer type for global offsets bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output -__launch_bounds__ (int(AgentSelectIfPolicyT::BLOCK_THREADS)) +CUB_DETAIL_LAUNCH_BOUNDS(int(AgentSelectIfPolicyT::BLOCK_THREADS)) __global__ void DeviceSelectSweepKernel( InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) diff --git a/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/device/dispatch/dispatch_spmv_orig.cuh index 8d01b0208d..be712fa88d 100644 --- a/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -156,7 +156,7 @@ template < typename CoordinateT, ///< Merge path coordinate type bool HAS_ALPHA, ///< Whether the input parameter Alpha is 1 bool HAS_BETA> ///< Whether the input parameter Beta is 0 -__launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS)) +CUB_DETAIL_LAUNCH_BOUNDS(int(SpmvPolicyT::BLOCK_THREADS)) __global__ void DeviceSpmvKernel( SpmvParams spmv_params, ///< [in] SpMV input parameter bundle CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates @@ -197,7 +197,7 @@ template < typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values typename OffsetT, ///< Signed integer type for global offsets typename ScanTileStateT> ///< Tile status interface type -__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) +CUB_DETAIL_LAUNCH_BOUNDS(int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) __global__ void DeviceSegmentFixupKernel( PairsInputIteratorT d_pairs_in, ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block AggregatesOutputIteratorT d_aggregates_out, ///< [in,out] Output value aggregates diff --git a/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/device/dispatch/dispatch_three_way_partition.cuh index 05ad062a4d..7cb7bb843e 100644 --- a/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -59,7 +59,7 @@ template -__launch_bounds__(int(AgentThreeWayPartitionPolicyT::BLOCK_THREADS)) __global__ +CUB_DETAIL_LAUNCH_BOUNDS(int(AgentThreeWayPartitionPolicyT::BLOCK_THREADS)) __global__ void DeviceThreeWayPartitionKernel(InputIteratorT d_in, FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, diff --git a/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/device/dispatch/dispatch_unique_by_key.cuh index a53e62c7fc..c110834057 100644 --- a/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -58,7 +58,7 @@ template < typename ScanTileStateT, ///< Tile status interface type typename EqualityOpT, ///< Equality operator type typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int(AgentUniqueByKeyPolicyT::UniqueByKeyPolicyT::BLOCK_THREADS)) +CUB_DETAIL_LAUNCH_BOUNDS(int(AgentUniqueByKeyPolicyT::UniqueByKeyPolicyT::BLOCK_THREADS)) __global__ void DeviceUniqueByKeySweepKernel( KeyInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys ValueInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of values