Skip to content

Commit

Permalink
Merge pull request NVIDIA#718 from senior-zero/enh-main/github/select…
Browse files Browse the repository at this point in the history
…_sm90_tuning

Tune select and partition for SM90
  • Loading branch information
gevtushenko authored Jun 20, 2023
2 parents f76fbda + c84d218 commit 81dd8c8
Show file tree
Hide file tree
Showing 4 changed files with 642 additions and 177 deletions.
143 changes: 3 additions & 140 deletions cub/device/dispatch/dispatch_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,9 @@

#pragma once

#include <iterator>

#include <cub/agent/agent_scan.cuh>
#include <cub/config.cuh>
#include <cub/device/dispatch/tuning/tuning_scan.cuh>
#include <cub/grid/grid_queue.cuh>
#include <cub/thread/thread_operators.cuh>
#include <cub/util_debug.cuh>
Expand All @@ -48,6 +47,8 @@

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <iterator>

CUB_NAMESPACE_BEGIN

/******************************************************************************
Expand Down Expand Up @@ -195,144 +196,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
.ConsumeRange(num_items, tile_state, start_tile);
}

/******************************************************************************
* Policy
******************************************************************************/

namespace detail
{
namespace scan
{

template <int Threads, int Items, int L2B, int L2W>
struct tuning
{
static constexpr int threads = Threads;
static constexpr int items = Items;

using delay_constructor = detail::fixed_delay_constructor_t<L2B, L2W>;
};

template <class AccumT,
bool PrimitiveOp,
bool PrimitiveAccumulator = Traits<AccumT>::PRIMITIVE,
std::size_t AccumSize = sizeof(AccumT)>
struct sm90_tuning
{
static constexpr int threads = 128;
static constexpr int items = 15;

using delay_constructor = detail::default_delay_constructor_t<AccumT>;
};

// clang-format off
template <class T> struct sm90_tuning<T, true, true, 1> : tuning<192, 22, 168, 1140> {};
template <class T> struct sm90_tuning<T, true, true, 2> : tuning<512, 12, 376, 1125> {};
template <class T> struct sm90_tuning<T, true, true, 4> : tuning<128, 24, 648, 1245> {};
template <class T> struct sm90_tuning<T, true, true, 8> : tuning<224, 24, 632, 1290> {};

template <> struct sm90_tuning<float, true, true, sizeof(float)> : tuning<128, 24, 688, 1140> {};
template <> struct sm90_tuning<double, true, true, sizeof(double)> : tuning<224, 24, 576, 1215> {};

#if CUB_IS_INT128_ENABLED
template <> struct sm90_tuning< __int128_t, true, false, sizeof(__int128_t)> : tuning<576, 21, 860, 630> {};
template <> struct sm90_tuning<__uint128_t, true, false, sizeof(__uint128_t)> : tuning<576, 21, 860, 630> {};
#endif
// clang-format on

} // namespace scan
} // namespace detail

template <typename AccumT, typename ScanOpT = Sum>
struct DeviceScanPolicy
{
// For large values, use timesliced loads/stores to fit shared memory.
static constexpr bool LargeValues = sizeof(AccumT) > 128;
static constexpr BlockLoadAlgorithm ScanTransposedLoad =
LargeValues ? BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED
: BLOCK_LOAD_WARP_TRANSPOSE;
static constexpr BlockStoreAlgorithm ScanTransposedStore =
LargeValues ? BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED
: BLOCK_STORE_WARP_TRANSPOSE;

template <int NOMINAL_BLOCK_THREADS_4B,
int NOMINAL_ITEMS_PER_THREAD_4B,
typename ComputeT,
BlockLoadAlgorithm LOAD_ALGORITHM,
CacheLoadModifier LOAD_MODIFIER,
BlockStoreAlgorithm STORE_ALGORITHM,
BlockScanAlgorithm SCAN_ALGORITHM,
typename DelayConstructorT>
using policy_t =
AgentScanPolicy<NOMINAL_BLOCK_THREADS_4B,
NOMINAL_ITEMS_PER_THREAD_4B,
ComputeT,
LOAD_ALGORITHM,
LOAD_MODIFIER,
STORE_ALGORITHM,
SCAN_ALGORITHM,
MemBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>,
DelayConstructorT>;

/// SM350
struct Policy350 : ChainedPolicy<350, Policy350, Policy350>
{
// GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T
using ScanPolicyT = policy_t<128,
12, ///< Threads per block, items per thread
AccumT,
BLOCK_LOAD_DIRECT,
LOAD_CA,
BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
BLOCK_SCAN_RAKING,
detail::default_delay_constructor_t<AccumT>>;
};

/// SM520
struct Policy520 : ChainedPolicy<520, Policy520, Policy350>
{
// Titan X: 32.47B items/s @ 48M 32-bit T
using ScanPolicyT = policy_t<128,
12, ///< Threads per block, items per thread
AccumT,
BLOCK_LOAD_DIRECT,
LOAD_CA,
ScanTransposedStore,
BLOCK_SCAN_WARP_SCANS,
detail::default_delay_constructor_t<AccumT>>;
};

/// SM600
struct Policy600 : ChainedPolicy<600, Policy600, Policy520>
{
using ScanPolicyT = policy_t<128,
15, ///< Threads per block, items per thread
AccumT,
ScanTransposedLoad,
LOAD_DEFAULT,
ScanTransposedStore,
BLOCK_SCAN_WARP_SCANS,
detail::default_delay_constructor_t<AccumT>>;
};

/// SM900
struct Policy900 : ChainedPolicy<900, Policy900, Policy600>
{
using tuning = detail::scan::sm90_tuning<AccumT, detail::basic_binary_op_t<ScanOpT>::value>;

using ScanPolicyT = policy_t<tuning::threads,
tuning::items,
AccumT,
ScanTransposedLoad,
LOAD_DEFAULT,
ScanTransposedStore,
BLOCK_SCAN_WARP_SCANS,
typename tuning::delay_constructor>;
};

using MaxPolicy = Policy900;
};

/******************************************************************************
* Dispatch
******************************************************************************/
Expand Down
45 changes: 8 additions & 37 deletions cub/device/dispatch/dispatch_select_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include <cub/agent/agent_select_if.cuh>
#include <cub/config.cuh>
#include <cub/device/dispatch/dispatch_scan.cuh>
#include <cub/device/dispatch/tuning/tuning_select_if.cuh>
#include <cub/grid/grid_queue.cuh>
#include <cub/thread/thread_operators.cuh>
#include <cub/util_deprecated.cuh>
Expand All @@ -45,11 +46,11 @@

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <nv/target>

#include <cstdio>
#include <iterator>

#include <nv/target>

CUB_NAMESPACE_BEGIN

/******************************************************************************
Expand Down Expand Up @@ -164,34 +165,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SelectIfPolicyT::BLOCK_THREA
}


namespace detail
{

template <class InputT, bool MayAlias>
struct device_select_policy_hub
{
struct Policy350 : ChainedPolicy<350, Policy350, Policy350>
{
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = 10;

static constexpr int ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));

using SelectIfPolicyT = AgentSelectIfPolicy<128,
ITEMS_PER_THREAD,
BLOCK_LOAD_DIRECT,
MayAlias ? LOAD_CA : LOAD_LDG,
BLOCK_SCAN_WARP_SCANS,
detail::fixed_delay_constructor_t<350, 450>>;
};

using MaxPolicy = Policy350;
};

} // detail


/******************************************************************************
* Dispatch
******************************************************************************/
Expand Down Expand Up @@ -236,19 +209,17 @@ template <typename InputIteratorT,
bool KEEP_REJECTS,
bool MayAlias = false,
typename SelectedPolicy =
detail::device_select_policy_hub<cub::detail::value_t<InputIteratorT>, MayAlias>>
detail::device_select_policy_hub<cub::detail::value_t<InputIteratorT>,
cub::detail::value_t<FlagsInputIteratorT>,
OffsetT,
MayAlias,
KEEP_REJECTS>>
struct DispatchSelectIf : SelectedPolicy
{
/******************************************************************************
* Types and constants
******************************************************************************/

// The input value type
using InputT = cub::detail::value_t<InputIteratorT>;

// The flag value type
using FlagT = cub::detail::value_t<FlagsInputIteratorT>;

// Tile status descriptor interface type
using ScanTileStateT = ScanTileState<OffsetT>;

Expand Down
Loading

0 comments on commit 81dd8c8

Please sign in to comment.