Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[EXPERIMENT] Use resource_ref for get/set_per_device_resource #1634

Draft
wants to merge 9 commits into
base: branch-24.10
Choose a base branch
from
53 changes: 37 additions & 16 deletions include/rmm/mr/device/aligned_resource_adaptor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cstddef>
Expand Down Expand Up @@ -65,12 +66,36 @@ class aligned_resource_adaptor final : public device_memory_resource {
* @param alignment_threshold Only allocations with a size larger than or equal to this threshold
* are aligned.
*/
explicit aligned_resource_adaptor(Upstream* upstream,
explicit aligned_resource_adaptor(device_async_resource_ref upstream,
std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT,
std::size_t alignment_threshold = default_alignment_threshold)
: upstream_{upstream}, alignment_{alignment}, alignment_threshold_{alignment_threshold}
{
RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer.");
RMM_EXPECTS(rmm::is_supported_alignment(alignment),
"Allocation alignment is not a power of 2.");
}

/**
 * @brief Construct an aligned resource adaptor using `upstream` to satisfy allocation requests.
 *
 * @throws rmm::logic_error if `upstream == nullptr`
 * @throws rmm::logic_error if `alignment` is not a power of 2
 *
 * @param upstream The resource used for allocating/deallocating device memory.
 * @param alignment The size used for allocation alignment.
 * @param alignment_threshold Only allocations with a size larger than or equal to this threshold
 * are aligned.
 */
explicit aligned_resource_adaptor(Upstream* upstream,
                                  std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT,
                                  std::size_t alignment_threshold = default_alignment_threshold)
  : upstream_{[upstream]() {
      // Immediately-invoked lambda so the null check runs inside the member
      // initializer list, before dereferencing `upstream` to build the
      // non-nullable resource_ref.
      RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer.");
      return device_async_resource_ref{*upstream};
    }()},
    alignment_{alignment},
    alignment_threshold_{alignment_threshold}
{
  RMM_EXPECTS(rmm::is_supported_alignment(alignment),
              "Allocation alignment is not a power of 2.");
}
Expand All @@ -90,11 +115,6 @@ class aligned_resource_adaptor final : public device_memory_resource {
return upstream_;
}

/**
* @briefreturn{Upstream* to the upstream memory resource}
*/
[[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; }

/**
* @brief The default alignment used by the adaptor.
*/
Expand All @@ -104,8 +124,8 @@ class aligned_resource_adaptor final : public device_memory_resource {
using lock_guard = std::lock_guard<std::mutex>;

/**
* @brief Allocates memory of size at least `bytes` using the upstream resource with the specified
* alignment.
* @brief Allocates memory of size at least `bytes` using the upstream resource with the
* specified alignment.
*
* @throws rmm::bad_alloc if the requested allocation could not be fulfilled
* by the upstream resource.
Expand All @@ -117,10 +137,10 @@ class aligned_resource_adaptor final : public device_memory_resource {
void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
{
if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) {
return upstream_->allocate(bytes, stream);
return get_upstream_resource().allocate_async(bytes, 1, stream);
}
auto const size = upstream_allocation_size(bytes);
void* pointer = upstream_->allocate(size, stream);
void* pointer = get_upstream_resource().allocate_async(size, 1, stream);
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
auto const address = reinterpret_cast<std::size_t>(pointer);
auto const aligned_address = rmm::align_up(address, alignment_);
Expand All @@ -143,7 +163,7 @@ class aligned_resource_adaptor final : public device_memory_resource {
void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override
{
if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) {
upstream_->deallocate(ptr, bytes, stream);
get_upstream_resource().deallocate_async(ptr, bytes, 1, stream);
} else {
{
lock_guard lock(mtx_);
Expand All @@ -153,7 +173,7 @@ class aligned_resource_adaptor final : public device_memory_resource {
pointers_.erase(iter);
}
}
upstream_->deallocate(ptr, upstream_allocation_size(bytes), stream);
get_upstream_resource().deallocate_async(ptr, upstream_allocation_size(bytes), 1, stream);
}
}

Expand All @@ -174,8 +194,8 @@ class aligned_resource_adaptor final : public device_memory_resource {
}

/**
* @brief Calculate the allocation size needed from upstream to account for alignments of both the
* size and the base pointer.
* @brief Calculate the allocation size needed from upstream to account for alignments of both
* the size and the base pointer.
*
* @param bytes The requested allocation size.
* @return Allocation size needed from upstream to align both the size and the base pointer.
Expand All @@ -186,7 +206,8 @@ class aligned_resource_adaptor final : public device_memory_resource {
return aligned_size + alignment_ - rmm::CUDA_ALLOCATION_ALIGNMENT;
}

Upstream* upstream_; ///< The upstream resource used for satisfying allocation requests
/// The upstream resource used for satisfying allocation requests
device_async_resource_ref upstream_{rmm::mr::get_current_device_resource()};
std::unordered_map<void*, void*> pointers_; ///< Map of aligned pointers to upstream pointers.
std::size_t alignment_; ///< The size used for allocation alignment
std::size_t alignment_threshold_; ///< The size above which allocations should be aligned
Expand Down
35 changes: 31 additions & 4 deletions include/rmm/mr/device/arena_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <rmm/logger.hpp>
#include <rmm/mr/device/detail/arena.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda_runtime_api.h>

Expand Down Expand Up @@ -80,6 +81,26 @@ namespace rmm::mr {
template <typename Upstream>
class arena_memory_resource final : public device_memory_resource {
public:
/**
 * @brief Construct an `arena_memory_resource`.
 *
 * @param upstream_mr The memory resource from which to allocate blocks for the global arena.
 * @param arena_size Size in bytes of the global arena. Defaults to half of the available
 * memory on the current device.
 * @param dump_log_on_failure If true, dump memory log when running out of memory.
 */
explicit arena_memory_resource(device_async_resource_ref upstream_mr,
                               std::optional<std::size_t> arena_size = std::nullopt,
                               bool dump_log_on_failure = false)
  : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure}
{
  if (dump_log_on_failure_) {
    logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log");
    // Logging defaults to `info`; change this to `debug` for more detailed output.
    logger_->set_level(spdlog::level::info);
  }
}

/**
* @brief Construct an `arena_memory_resource`.
*
Expand All @@ -93,7 +114,13 @@ class arena_memory_resource final : public device_memory_resource {
explicit arena_memory_resource(Upstream* upstream_mr,
std::optional<std::size_t> arena_size = std::nullopt,
bool dump_log_on_failure = false)
: global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure}
: global_arena_{[upstream_mr]() {
RMM_EXPECTS(upstream_mr != nullptr,
"Unexpected null upstream memory resource.");
return device_async_resource_ref{*upstream_mr};
}(),
arena_size},
dump_log_on_failure_{dump_log_on_failure}
{
if (dump_log_on_failure_) {
logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log");
Expand All @@ -111,8 +138,8 @@ class arena_memory_resource final : public device_memory_resource {
arena_memory_resource& operator=(arena_memory_resource&&) noexcept = delete;

private:
using global_arena = rmm::mr::detail::arena::global_arena<Upstream>;
using arena = rmm::mr::detail::arena::arena<Upstream>;
using global_arena = rmm::mr::detail::arena::global_arena;
using arena = rmm::mr::detail::arena::arena;

/**
* @brief Allocates memory of size at least `bytes`.
Expand Down Expand Up @@ -272,7 +299,7 @@ class arena_memory_resource final : public device_memory_resource {
std::unique_lock lock(map_mtx_);
auto thread_arena = std::make_shared<arena>(global_arena_);
thread_arenas_.emplace(thread_id, thread_arena);
thread_local detail::arena::arena_cleaner<Upstream> cleaner{thread_arena};
thread_local detail::arena::arena_cleaner cleaner{thread_arena};
return *thread_arena;
}
}
Expand Down
13 changes: 7 additions & 6 deletions include/rmm/mr/device/callback_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,13 @@ class callback_memory_resource final : public device_memory_resource {
* It is the caller's responsibility to maintain the lifetime of the pointed-to data
* for the duration of the lifetime of the `callback_memory_resource`.
*/
callback_memory_resource(allocate_callback_t allocate_callback,
deallocate_callback_t deallocate_callback,
void* allocate_callback_arg = nullptr,
void* deallocate_callback_arg = nullptr) noexcept
: allocate_callback_(allocate_callback),
deallocate_callback_(deallocate_callback),
callback_memory_resource(
allocate_callback_t allocate_callback,
deallocate_callback_t deallocate_callback,
void* allocate_callback_arg = nullptr, // NOLINT(bugprone-easily-swappable-parameters)
void* deallocate_callback_arg = nullptr) noexcept
: allocate_callback_(std::move(allocate_callback)),
deallocate_callback_(std::move(deallocate_callback)),
allocate_callback_arg_(allocate_callback_arg),
deallocate_callback_arg_(deallocate_callback_arg)
{
Expand Down
31 changes: 15 additions & 16 deletions include/rmm/mr/device/detail/arena.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <rmm/detail/error.hpp>
#include <rmm/detail/logging_assert.hpp>
#include <rmm/logger.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda_runtime_api.h>

Expand Down Expand Up @@ -492,7 +493,6 @@ inline auto max_free_size(std::set<superblock> const& superblocks)
 * Allocates the arena from an upstream memory resource supplied as a
 * `device_async_resource_ref` (the class is no longer templated on the upstream type).
 */
template <typename Upstream>
class global_arena final {
public:
/**
Expand All @@ -504,10 +504,9 @@ class global_arena final {
* @param arena_size Size in bytes of the global arena. Defaults to half of the available memory
* on the current device.
*/
global_arena(Upstream* upstream_mr, std::optional<std::size_t> arena_size)
global_arena(device_async_resource_ref upstream_mr, std::optional<std::size_t> arena_size)
: upstream_mr_{upstream_mr}
{
RMM_EXPECTS(nullptr != upstream_mr_, "Unexpected null upstream pointer.");
auto const size =
rmm::align_down(arena_size.value_or(default_size()), rmm::CUDA_ALLOCATION_ALIGNMENT);
RMM_EXPECTS(size >= superblock::minimum_size,
Expand All @@ -528,7 +527,7 @@ class global_arena final {
~global_arena()
{
std::lock_guard lock(mtx_);
upstream_mr_->deallocate(upstream_block_.pointer(), upstream_block_.size());
upstream_mr_.deallocate(upstream_block_.pointer(), upstream_block_.size());
}

/**
Expand All @@ -537,7 +536,7 @@ class global_arena final {
* @param size The size in bytes of the allocation.
* @return bool True if the allocation should be handled by the global arena.
*/
bool handles(std::size_t size) const { return size > superblock::minimum_size; }
static bool handles(std::size_t size) { return size > superblock::minimum_size; }

/**
* @brief Acquire a superblock that can fit a block of the given size.
Expand Down Expand Up @@ -608,7 +607,7 @@ class global_arena final {
* @param stream Stream on which to perform deallocation.
* @return bool true if the allocation is found, false otherwise.
*/
bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream)
bool deallocate_async(void* ptr, std::size_t size, cuda_stream_view stream)
{
RMM_LOGGING_ASSERT(handles(size));
stream.synchronize_no_throw();
Expand Down Expand Up @@ -690,7 +689,7 @@ class global_arena final {
* @brief Default size of the global arena if unspecified.
* @return the default global arena size.
*/
constexpr std::size_t default_size() const
static std::size_t default_size()
{
auto const [free, total] = rmm::available_device_memory();
return free / 2;
Expand All @@ -703,7 +702,7 @@ class global_arena final {
*/
void initialize(std::size_t size)
{
upstream_block_ = {upstream_mr_->allocate(size), size};
upstream_block_ = {upstream_mr_.allocate(size), size};
superblocks_.emplace(upstream_block_.pointer(), size);
}

Expand Down Expand Up @@ -775,7 +774,7 @@ class global_arena final {
}

/// The upstream resource to allocate memory from.
Upstream* upstream_mr_;
device_async_resource_ref upstream_mr_;
/// Block allocated from upstream so that it can be quickly freed.
block upstream_block_;
/// Address-ordered set of superblocks.
Expand All @@ -793,15 +792,14 @@ class global_arena final {
 * Obtains its memory from the shared `global_arena` passed at construction
 * (the class is no longer templated on the upstream resource type).
 */
template <typename Upstream>
class arena {
public:
/**
* @brief Construct an `arena`.
*
* @param global_arena The global arena from which to allocate superblocks.
*/
explicit arena(global_arena<Upstream>& global_arena) : global_arena_{global_arena} {}
explicit arena(global_arena& global_arena) : global_arena_{global_arena} {}

// Disable copy (and move) semantics.
arena(arena const&) = delete;
Expand Down Expand Up @@ -835,7 +833,9 @@ class arena {
*/
bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream)
{
if (global_arena_.handles(size) && global_arena_.deallocate(ptr, size, stream)) { return true; }
if (global_arena::handles(size) && global_arena_.deallocate_async(ptr, size, stream)) {
return true;
}
return deallocate(ptr, size);
}

Expand Down Expand Up @@ -959,7 +959,7 @@ class arena {
}

/// The global arena to allocate superblocks from.
global_arena<Upstream>& global_arena_;
global_arena& global_arena_;
/// Acquired superblocks.
std::set<superblock> superblocks_;
/// Mutex for exclusive lock.
Expand All @@ -974,10 +974,9 @@ class arena {
 * Holds a non-owning `std::weak_ptr` to an `arena` that may need cleaning when
 * the owning thread exits (the class is no longer templated on the upstream type).
 */
template <typename Upstream>
class arena_cleaner {
public:
explicit arena_cleaner(std::shared_ptr<arena<Upstream>> const& arena) : arena_(arena) {}
explicit arena_cleaner(std::shared_ptr<arena> const& arena) : arena_(arena) {}

// Disable copy (and move) semantics.
arena_cleaner(arena_cleaner const&) = delete;
Expand All @@ -995,7 +994,7 @@ class arena_cleaner {

private:
/// A non-owning pointer to the arena that may need cleaning.
std::weak_ptr<arena<Upstream>> arena_;
std::weak_ptr<arena> arena_;
};

} // namespace rmm::mr::detail::arena
Loading
Loading