Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds new virtual shared memory facility to DeviceSelect::UniqueByKey #1197

Merged
merged 8 commits into from
Dec 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 85 additions & 69 deletions cub/cub/device/dispatch/dispatch_unique_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@
#include <cub/agent/agent_unique_by_key.cuh>
#include <cub/device/dispatch/dispatch_scan.cuh>
#include <cub/device/dispatch/tuning/tuning_unique_by_key.cuh>
#include <cub/config.cuh>
#include <cub/util_deprecated.cuh>
#include <cub/util_device.cuh>
#include <cub/util_math.cuh>

#include <iterator>
Expand Down Expand Up @@ -112,6 +112,9 @@ CUB_NAMESPACE_BEGIN
*
* @param[in] num_tiles
* Total number of tiles for the entire problem
*
* @param[in] vsmem
* Memory to support virtual shared memory
*/
template <typename ChainedPolicyT,
typename KeyInputIteratorT,
Expand All @@ -122,40 +125,58 @@ template <typename ChainedPolicyT,
typename ScanTileStateT,
typename EqualityOpT,
typename OffsetT>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::UniqueByKeyPolicyT::BLOCK_THREADS))
CUB_DETAIL_KERNEL_ATTRIBUTES
void DeviceUniqueByKeySweepKernel(KeyInputIteratorT d_keys_in,
ValueInputIteratorT d_values_in,
KeyOutputIteratorT d_keys_out,
ValueOutputIteratorT d_values_out,
NumSelectedIteratorT d_num_selected_out,
ScanTileStateT tile_state,
EqualityOpT equality_op,
OffsetT num_items,
int num_tiles)
__launch_bounds__(int(
cub::detail::vsmem_helper_default_fallback_policy_t<
typename ChainedPolicyT::ActivePolicy::UniqueByKeyPolicyT,
AgentUniqueByKey,
KeyInputIteratorT,
ValueInputIteratorT,
KeyOutputIteratorT,
ValueOutputIteratorT,
EqualityOpT,
OffsetT>::agent_policy_t::BLOCK_THREADS))
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceUniqueByKeySweepKernel(
KeyInputIteratorT d_keys_in,
ValueInputIteratorT d_values_in,
KeyOutputIteratorT d_keys_out,
ValueOutputIteratorT d_values_out,
NumSelectedIteratorT d_num_selected_out,
ScanTileStateT tile_state,
EqualityOpT equality_op,
OffsetT num_items,
int num_tiles,
cub::detail::vsmem_t vsmem)
{
using AgentUniqueByKeyPolicyT = typename ChainedPolicyT::ActivePolicy::UniqueByKeyPolicyT;

// Thread block type for selecting data from input tiles
using AgentUniqueByKeyT = AgentUniqueByKey<AgentUniqueByKeyPolicyT,
KeyInputIteratorT,
ValueInputIteratorT,
KeyOutputIteratorT,
ValueOutputIteratorT,
EqualityOpT,
OffsetT>;

// Shared memory for AgentUniqueByKey
__shared__ typename AgentUniqueByKeyT::TempStorage temp_storage;

// Process tiles
AgentUniqueByKeyT(temp_storage, d_keys_in, d_values_in, d_keys_out, d_values_out, equality_op, num_items).ConsumeRange(
num_tiles,
tile_state,
d_num_selected_out);
using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t<
typename ChainedPolicyT::ActivePolicy::UniqueByKeyPolicyT,
AgentUniqueByKey,
KeyInputIteratorT,
ValueInputIteratorT,
KeyOutputIteratorT,
ValueOutputIteratorT,
EqualityOpT,
OffsetT>;

using AgentUniqueByKeyPolicyT = typename VsmemHelperT::agent_policy_t;

// Thread block type for selecting data from input tiles
using AgentUniqueByKeyT = typename VsmemHelperT::agent_t;

// Static shared memory allocation
__shared__ typename VsmemHelperT::static_temp_storage_t static_temp_storage;

// Get temporary storage
typename AgentUniqueByKeyT::TempStorage& temp_storage =
VsmemHelperT::get_temp_storage(static_temp_storage, vsmem, (blockIdx.x * gridDim.y) + blockIdx.y);
elstehle marked this conversation as resolved.
Show resolved Hide resolved

// Process tiles
AgentUniqueByKeyT(temp_storage, d_keys_in, d_values_in, d_keys_out, d_values_out, equality_op, num_items)
.ConsumeRange(num_tiles, tile_state, d_num_selected_out);

// If applicable, hints to discard modified cache lines for vsmem
VsmemHelperT::discard_temp_storage(temp_storage);
}


/******************************************************************************
* Dispatch
******************************************************************************/
Expand Down Expand Up @@ -338,13 +359,16 @@ struct DispatchUniqueByKey : SelectedPolicy
cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel)
{
using Policy = typename ActivePolicyT::UniqueByKeyPolicyT;
using UniqueByKeyAgentT = AgentUniqueByKey<Policy,
KeyInputIteratorT,
ValueInputIteratorT,
KeyOutputIteratorT,
ValueOutputIteratorT,
EqualityOpT,
OffsetT>;

using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t<
Policy,
AgentUniqueByKey,
KeyInputIteratorT,
ValueInputIteratorT,
KeyOutputIteratorT,
ValueOutputIteratorT,
EqualityOpT,
OffsetT>;

cudaError error = cudaSuccess;
do
Expand All @@ -358,23 +382,14 @@ struct DispatchUniqueByKey : SelectedPolicy
}

// Number of input tiles
int tile_size = Policy::BLOCK_THREADS * Policy::ITEMS_PER_THREAD;
int num_tiles = static_cast<int>(cub::DivideAndRoundUp(num_items, tile_size));

// Size of virtual shared memory
int max_shmem = 0;

error = CubDebug(cudaDeviceGetAttribute(&max_shmem,
cudaDevAttrMaxSharedMemoryPerBlock,
device_ordinal));
if (cudaSuccess != error)
{
break;
}
std::size_t vshmem_size = detail::VshmemSize(max_shmem, sizeof(typename UniqueByKeyAgentT::TempStorage), num_tiles);
constexpr auto block_threads = VsmemHelperT::agent_policy_t::BLOCK_THREADS;
constexpr auto items_per_thread = VsmemHelperT::agent_policy_t::ITEMS_PER_THREAD;
int tile_size = block_threads * items_per_thread;
int num_tiles = static_cast<int>(cub::DivideAndRoundUp(num_items, tile_size));
const auto vsmem_size = num_tiles * VsmemHelperT::vsmem_per_block;

// Specify temporary storage allocation requirements
size_t allocation_sizes[2] = {0, vshmem_size};
size_t allocation_sizes[2] = {0, vsmem_size};

// Bytes needed for tile status descriptors
error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]));
Expand Down Expand Up @@ -459,7 +474,7 @@ struct DispatchUniqueByKey : SelectedPolicy
int scan_sm_occupancy;
error = CubDebug(MaxSmOccupancy(scan_sm_occupancy, // out
scan_kernel,
Policy::BLOCK_THREADS));
block_threads));
if (cudaSuccess != error)
{
break;
Expand All @@ -470,26 +485,27 @@ struct DispatchUniqueByKey : SelectedPolicy
scan_grid_size.x,
scan_grid_size.y,
scan_grid_size.z,
Policy::BLOCK_THREADS,
block_threads,
(long long)stream,
Policy::ITEMS_PER_THREAD,
items_per_thread,
scan_sm_occupancy);
}
#endif

// Invoke select_if_kernel
error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
scan_grid_size, Policy::BLOCK_THREADS, 0, stream
).doit(scan_kernel,
d_keys_in,
d_values_in,
d_keys_out,
d_values_out,
d_num_selected_out,
tile_state,
equality_op,
num_items,
num_tiles);
error =
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream)
.doit(scan_kernel,
d_keys_in,
d_values_in,
d_keys_out,
d_values_out,
d_num_selected_out,
tile_state,
equality_op,
num_items,
num_tiles,
cub::detail::vsmem_t{allocations[1]});

// Check for failure to launch
error = CubDebug(error);
Expand Down
23 changes: 23 additions & 0 deletions cub/cub/util_device.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,17 @@ public:
return static_temp_storage;
}

/**
 * @brief Used from within the device algorithm's kernel to get the temporary storage that can be
 * passed to the agent, specialized for the case when we can use native shared memory as temporary
 * storage and taking a linear block id.
 *
 * The linear block id argument is accepted but ignored: when the agent's temporary storage fits
 * into native shared memory, the block-local static allocation is returned directly and no
 * per-block indexing into virtual shared memory is needed.
 */
static __device__ __forceinline__ typename AgentT::TempStorage&
get_temp_storage(typename AgentT::TempStorage& static_temp_storage, vsmem_t&, std::size_t)
{
  return static_temp_storage;
}

/**
* @brief Used from within the device algorithm's kernel to get the temporary storage that can be
* passed to the agent, specialized for the case when we have to use global memory-backed
Expand All @@ -148,6 +159,18 @@ public:
static_cast<char*>(vsmem.gmem_ptr) + (vsmem_per_block * blockIdx.x));
}

/**
 * @brief Used from within the device algorithm's kernel to get the temporary storage that can be
 * passed to the agent, specialized for the case when we have to use global memory-backed
 * virtual shared memory as temporary storage and taking a linear block id.
 *
 * Each thread block receives its own non-overlapping slice of the global-memory backing
 * allocation, offset by `linear_block_id * vsmem_per_block` bytes.
 *
 * NOTE(review): assumes `vsmem.gmem_ptr` points to an allocation of at least
 * (number of blocks) * `vsmem_per_block` bytes, suitably aligned for `TempStorage` —
 * the dispatch layer is responsible for sizing the allocation that way; confirm at call sites.
 */
static __device__ __forceinline__ typename AgentT::TempStorage&
get_temp_storage(cub::NullType& static_temp_storage, vsmem_t& vsmem, std::size_t linear_block_id)
{
  return *reinterpret_cast<typename AgentT::TempStorage*>(
    static_cast<char*>(vsmem.gmem_ptr) + (vsmem_per_block * linear_block_id));
}

/**
 * @brief Hints to discard modified cache lines of the used virtual shared memory.
Expand Down
11 changes: 11 additions & 0 deletions cub/test/c2h/custom_type.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,17 @@ public:

};

/**
 * @brief Helper generating a padding mixin that inflates a test type to a given total size.
 *
 * `huge_data<TotalSize>::type` is intended to be used as a mixin template argument of
 * `c2h::custom_type_t`, adding `TotalSize - sizeof(custom_type_state_t)` bytes of padding
 * so the composed type reaches (approximately) `TotalSize` bytes. Used to exercise code
 * paths that only trigger for very large value types (e.g. virtual shared memory fallbacks).
 */
template <std::size_t TotalSize>
struct huge_data
{
  template <class CustomType>
  class type
  {
    // Guard against unsigned wrap-around: a TotalSize smaller than the base state size
    // would make extra_member_bytes a huge std::size_t value and request an absurd array.
    static_assert(TotalSize >= sizeof(custom_type_state_t),
                  "huge_data: TotalSize must be at least sizeof(custom_type_state_t)");

    // Padding bytes on top of the base custom_type_state_t needed to reach TotalSize.
    static constexpr auto extra_member_bytes = (TotalSize - sizeof(custom_type_state_t));
    std::uint8_t data[extra_member_bytes];
  };
};

template <class CustomType>
class less_comparable_t
{
Expand Down
66 changes: 65 additions & 1 deletion cub/test/catch2_test_device_select_unique_by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,18 @@

#include <cub/device/device_select.cuh>

#include <thrust/detail/raw_pointer_cast.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>

#include <algorithm>

#include "catch2_test_launch_helper.h"
#include "catch2_test_helper.h"
#include "catch2_test_launch_helper.h"

template<class T>
inline T to_bound(const unsigned long long bound) {
Expand Down Expand Up @@ -66,6 +68,19 @@ inline c2h::custom_type_t<c2h::equal_comparable_t> to_bound(const unsigned long
return val;
}

/**
 * @brief Functor mapping an index to an instance of a huge value type.
 *
 * Value-initializes a `HugeDataTypeT` and assigns the given index to both its `key` and
 * `val` members, leaving any padding at its value-initialized state. Used to generate
 * large values on the fly via a transform iterator without materializing an input buffer.
 *
 * The call operator is const-qualified so the functor remains callable through
 * const-qualified contexts (e.g. dereferencing a const thrust::transform_iterator).
 */
template <typename HugeDataTypeT>
struct index_to_huge_type_op_t
{
  template <typename ValueType>
  __device__ __host__ HugeDataTypeT operator()(const ValueType& val) const
  {
    HugeDataTypeT return_val{};
    return_val.key = val;
    return_val.val = val;
    return return_val;
  }
};

DECLARE_LAUNCH_WRAPPER(cub::DeviceSelect::UniqueByKey, select_unique_by_key);

// %PARAM% TEST_LAUNCH lid 0:1:2
Expand All @@ -86,6 +101,9 @@ using all_types = c2h::type_list<std::uint8_t,
long2,
c2h::custom_type_t<c2h::equal_comparable_t>>;

using huge_types = c2h::type_list<c2h::custom_type_t<c2h::equal_comparable_t, c2h::huge_data<128>::type>,
c2h::custom_type_t<c2h::equal_comparable_t, c2h::huge_data<256>::type>>;

using types = c2h::type_list<std::uint8_t,
std::uint32_t>;

Expand Down Expand Up @@ -338,3 +356,49 @@ CUB_TEST("DeviceSelect::UniqueByKey works with a different output type", "[devic
REQUIRE(reference_keys == keys_out);
REQUIRE(reference_vals == vals_out);
}

// Verifies DeviceSelect::UniqueByKey on value types that are deliberately too large for
// native shared memory (see `huge_types`), so the dispatch layer must fall back to the
// global-memory-backed virtual shared memory (vsmem) facility.
CUB_TEST("DeviceSelect::UniqueByKey works and uses vsmem for large types",
         "[device][select_unique_by_key][vsmem]",
         huge_types)
{
  using type = std::uint32_t;
  using val_type = typename c2h::get<0, TestType>;

  // Keys are drawn from the narrow range [0, 42] so adjacent duplicates are likely,
  // which exercises the actual de-duplication logic rather than a pass-through copy.
  const int num_items = GENERATE_COPY(take(2, random(1, 100000)));
  thrust::device_vector<type> keys_in(num_items);
  thrust::device_vector<type> keys_out(num_items);
  thrust::device_vector<val_type> vals_out(num_items);
  c2h::gen(CUB_SEED(2), keys_in, to_bound<type>(0), to_bound<type>(42));

  // Huge values are generated on the fly from their index; no large input buffer needed.
  auto vals_it =
    thrust::make_transform_iterator(thrust::make_counting_iterator(0U), index_to_huge_type_op_t<val_type>{});

  // Needs to be device accessible
  thrust::device_vector<int> num_selected_out(1, 0);
  int* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data());

  select_unique_by_key(
    thrust::raw_pointer_cast(keys_in.data()),
    vals_it,
    thrust::raw_pointer_cast(keys_out.data()),
    thrust::raw_pointer_cast(vals_out.data()),
    d_first_num_selected_out,
    num_items);

  // Ensure that we create the same output as std
  thrust::host_vector<type> reference_keys = keys_in;
  thrust::host_vector<val_type> reference_vals(num_items);
  thrust::copy(vals_it, vals_it + num_items, reference_vals.begin());

  // std::unique over zipped (key, value) pairs — comparing keys only via project_first —
  // produces the host-side reference selection.
  const auto zip_begin = thrust::make_zip_iterator(reference_keys.begin(), reference_vals.begin());
  const auto zip_end = thrust::make_zip_iterator(reference_keys.end(), reference_vals.end());
  const auto boundary = std::unique(zip_begin, zip_end, project_first{});
  REQUIRE((boundary - zip_begin) == num_selected_out[0]);

  // Trim all buffers to the selected count before element-wise comparison.
  keys_out.resize(num_selected_out[0]);
  vals_out.resize(num_selected_out[0]);
  reference_keys.resize(num_selected_out[0]);
  reference_vals.resize(num_selected_out[0]);
  REQUIRE(reference_keys == keys_out);
  REQUIRE(reference_vals == vals_out);
}
Loading