diff --git a/include/rmm/mr/device/aligned_resource_adaptor.hpp b/include/rmm/mr/device/aligned_resource_adaptor.hpp index a91056dfa..111103dfd 100644 --- a/include/rmm/mr/device/aligned_resource_adaptor.hpp +++ b/include/rmm/mr/device/aligned_resource_adaptor.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -65,12 +66,36 @@ class aligned_resource_adaptor final : public device_memory_resource { * @param alignment_threshold Only allocations with a size larger than or equal to this threshold * are aligned. */ - explicit aligned_resource_adaptor(Upstream* upstream, + explicit aligned_resource_adaptor(device_async_resource_ref upstream, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT, std::size_t alignment_threshold = default_alignment_threshold) : upstream_{upstream}, alignment_{alignment}, alignment_threshold_{alignment_threshold} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + RMM_EXPECTS(rmm::is_supported_alignment(alignment), + "Allocation alignment is not a power of 2."); + } + + /** + * @brief Construct an aligned resource adaptor using `upstream` to satisfy allocation requests. + * + * @throws rmm::logic_error if `upstream == nullptr` + * @throws rmm::logic_error if `allocation_alignment` is not a power of 2 + * + * @param upstream The resource used for allocating/deallocating device memory. + * @param alignment The size used for allocation alignment. + * @param alignment_threshold Only allocations with a size larger than or equal to this threshold + * are aligned. 
+ */ + explicit aligned_resource_adaptor(Upstream* upstream, + std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT, + std::size_t alignment_threshold = default_alignment_threshold) + : upstream_{[upstream]() { + RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + return device_async_resource_ref{*upstream}; + }()}, + alignment_{alignment}, + alignment_threshold_{alignment_threshold} + { RMM_EXPECTS(rmm::is_supported_alignment(alignment), "Allocation alignment is not a power of 2."); } @@ -90,11 +115,6 @@ class aligned_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - /** * @brief The default alignment used by the adaptor. */ @@ -104,8 +124,8 @@ class aligned_resource_adaptor final : public device_memory_resource { using lock_guard = std::lock_guard; /** - * @brief Allocates memory of size at least `bytes` using the upstream resource with the specified - * alignment. + * @brief Allocates memory of size at least `bytes` using the upstream resource with the + * specified alignment. * * @throws rmm::bad_alloc if the requested allocation could not be fulfilled * by the upstream resource. 
@@ -117,10 +137,10 @@ class aligned_resource_adaptor final : public device_memory_resource { void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) { - return upstream_->allocate(bytes, stream); + return get_upstream_resource().allocate_async(bytes, 1, stream); } auto const size = upstream_allocation_size(bytes); - void* pointer = upstream_->allocate(size, stream); + void* pointer = get_upstream_resource().allocate_async(size, 1, stream); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) auto const address = reinterpret_cast(pointer); auto const aligned_address = rmm::align_up(address, alignment_); @@ -143,7 +163,7 @@ class aligned_resource_adaptor final : public device_memory_resource { void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) { - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, 1, stream); } else { { lock_guard lock(mtx_); @@ -153,7 +173,7 @@ class aligned_resource_adaptor final : public device_memory_resource { pointers_.erase(iter); } } - upstream_->deallocate(ptr, upstream_allocation_size(bytes), stream); + get_upstream_resource().deallocate_async(ptr, upstream_allocation_size(bytes), 1, stream); } } @@ -174,8 +194,8 @@ class aligned_resource_adaptor final : public device_memory_resource { } /** - * @brief Calculate the allocation size needed from upstream to account for alignments of both the - * size and the base pointer. + * @brief Calculate the allocation size needed from upstream to account for alignments of both + * the size and the base pointer. * * @param bytes The requested allocation size. * @return Allocation size needed from upstream to align both the size and the base pointer. 
@@ -186,7 +206,8 @@ class aligned_resource_adaptor final : public device_memory_resource { return aligned_size + alignment_ - rmm::CUDA_ALLOCATION_ALIGNMENT; } - Upstream* upstream_; ///< The upstream resource used for satisfying allocation requests + /// The upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_{rmm::mr::get_current_device_resource()}; std::unordered_map pointers_; ///< Map of aligned pointers to upstream pointers. std::size_t alignment_; ///< The size used for allocation alignment std::size_t alignment_threshold_; ///< The size above which allocations should be aligned diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index b274e0c18..0860a46d5 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -80,6 +81,26 @@ namespace rmm::mr { template class arena_memory_resource final : public device_memory_resource { public: + /** + * @brief Construct an `arena_memory_resource`. + * + * @param upstream_mr The memory resource from which to allocate blocks for the global arena. + * @param arena_size Size in bytes of the global arena. Defaults to half of the available + * memory on the current device. + * @param dump_log_on_failure If true, dump memory log when running out of memory. + */ + explicit arena_memory_resource(device_async_resource_ref upstream_mr, + std::optional arena_size = std::nullopt, + bool dump_log_on_failure = false) + : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure} + { + if (dump_log_on_failure_) { + logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log"); + // Set the level to `debug` for more detailed output. + logger_->set_level(spdlog::level::info); + } + } + /** * @brief Construct an `arena_memory_resource`. 
* @@ -93,7 +114,13 @@ class arena_memory_resource final : public device_memory_resource { explicit arena_memory_resource(Upstream* upstream_mr, std::optional arena_size = std::nullopt, bool dump_log_on_failure = false) - : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure} + : global_arena_{[upstream_mr]() { + RMM_EXPECTS(upstream_mr != nullptr, + "Unexpected null upstream memory resource."); + return device_async_resource_ref{*upstream_mr}; + }(), + arena_size}, + dump_log_on_failure_{dump_log_on_failure} { if (dump_log_on_failure_) { logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log"); @@ -111,8 +138,8 @@ class arena_memory_resource final : public device_memory_resource { arena_memory_resource& operator=(arena_memory_resource&&) noexcept = delete; private: - using global_arena = rmm::mr::detail::arena::global_arena; - using arena = rmm::mr::detail::arena::arena; + using global_arena = rmm::mr::detail::arena::global_arena; + using arena = rmm::mr::detail::arena::arena; /** * @brief Allocates memory of size at least `bytes`. @@ -272,7 +299,7 @@ class arena_memory_resource final : public device_memory_resource { std::unique_lock lock(map_mtx_); auto thread_arena = std::make_shared(global_arena_); thread_arenas_.emplace(thread_id, thread_arena); - thread_local detail::arena::arena_cleaner cleaner{thread_arena}; + thread_local detail::arena::arena_cleaner cleaner{thread_arena}; return *thread_arena; } } diff --git a/include/rmm/mr/device/callback_memory_resource.hpp b/include/rmm/mr/device/callback_memory_resource.hpp index 1483925de..74af8679a 100644 --- a/include/rmm/mr/device/callback_memory_resource.hpp +++ b/include/rmm/mr/device/callback_memory_resource.hpp @@ -84,12 +84,13 @@ class callback_memory_resource final : public device_memory_resource { * It is the caller's responsibility to maintain the lifetime of the pointed-to data * for the duration of the lifetime of the `callback_memory_resource`. 
*/ - callback_memory_resource(allocate_callback_t allocate_callback, - deallocate_callback_t deallocate_callback, - void* allocate_callback_arg = nullptr, - void* deallocate_callback_arg = nullptr) noexcept - : allocate_callback_(allocate_callback), - deallocate_callback_(deallocate_callback), + callback_memory_resource( + allocate_callback_t allocate_callback, + deallocate_callback_t deallocate_callback, + void* allocate_callback_arg = nullptr, // NOLINT(bugprone-easily-swappable-parameters) + void* deallocate_callback_arg = nullptr) noexcept + : allocate_callback_(std::move(allocate_callback)), + deallocate_callback_(std::move(deallocate_callback)), allocate_callback_arg_(allocate_callback_arg), deallocate_callback_arg_(deallocate_callback_arg) { diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index c7965ca34..324b9c928 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -492,7 +493,6 @@ inline auto max_free_size(std::set const& superblocks) * @tparam Upstream Memory resource to use for allocating the arena. Implements * rmm::mr::device_memory_resource interface. */ -template class global_arena final { public: /** @@ -504,10 +504,9 @@ class global_arena final { * @param arena_size Size in bytes of the global arena. Defaults to half of the available memory * on the current device. 
*/ - global_arena(Upstream* upstream_mr, std::optional arena_size) + global_arena(device_async_resource_ref upstream_mr, std::optional arena_size) : upstream_mr_{upstream_mr} { - RMM_EXPECTS(nullptr != upstream_mr_, "Unexpected null upstream pointer."); auto const size = rmm::align_down(arena_size.value_or(default_size()), rmm::CUDA_ALLOCATION_ALIGNMENT); RMM_EXPECTS(size >= superblock::minimum_size, @@ -528,7 +527,7 @@ class global_arena final { ~global_arena() { std::lock_guard lock(mtx_); - upstream_mr_->deallocate(upstream_block_.pointer(), upstream_block_.size()); + upstream_mr_.deallocate(upstream_block_.pointer(), upstream_block_.size()); } /** @@ -537,7 +536,7 @@ class global_arena final { * @param size The size in bytes of the allocation. * @return bool True if the allocation should be handled by the global arena. */ - bool handles(std::size_t size) const { return size > superblock::minimum_size; } + static bool handles(std::size_t size) { return size > superblock::minimum_size; } /** * @brief Acquire a superblock that can fit a block of the given size. @@ -608,7 +607,7 @@ class global_arena final { * @param stream Stream on which to perform deallocation. * @return bool true if the allocation is found, false otherwise. */ - bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream) + bool deallocate_async(void* ptr, std::size_t size, cuda_stream_view stream) { RMM_LOGGING_ASSERT(handles(size)); stream.synchronize_no_throw(); @@ -690,7 +689,7 @@ class global_arena final { * @brief Default size of the global arena if unspecified. * @return the default global arena size. 
*/ - constexpr std::size_t default_size() const + static std::size_t default_size() { auto const [free, total] = rmm::available_device_memory(); return free / 2; @@ -703,7 +702,7 @@ class global_arena final { */ void initialize(std::size_t size) { - upstream_block_ = {upstream_mr_->allocate(size), size}; + upstream_block_ = {upstream_mr_.allocate(size), size}; superblocks_.emplace(upstream_block_.pointer(), size); } @@ -775,7 +774,7 @@ class global_arena final { } /// The upstream resource to allocate memory from. - Upstream* upstream_mr_; + device_async_resource_ref upstream_mr_; /// Block allocated from upstream so that it can be quickly freed. block upstream_block_; /// Address-ordered set of superblocks. @@ -793,7 +792,6 @@ class global_arena final { * @tparam Upstream Memory resource to use for allocating the global arena. Implements * rmm::mr::device_memory_resource interface. */ -template class arena { public: /** @@ -801,7 +799,7 @@ class arena { * * @param global_arena The global arena from which to allocate superblocks. */ - explicit arena(global_arena& global_arena) : global_arena_{global_arena} {} + explicit arena(global_arena& global_arena) : global_arena_{global_arena} {} // Disable copy (and move) semantics. arena(arena const&) = delete; @@ -835,7 +833,9 @@ class arena { */ bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream) { - if (global_arena_.handles(size) && global_arena_.deallocate(ptr, size, stream)) { return true; } + if (global_arena::handles(size) && global_arena_.deallocate_async(ptr, size, stream)) { + return true; + } return deallocate(ptr, size); } @@ -959,7 +959,7 @@ class arena { } /// The global arena to allocate superblocks from. - global_arena& global_arena_; + global_arena& global_arena_; /// Acquired superblocks. std::set superblocks_; /// Mutex for exclusive lock. @@ -974,10 +974,9 @@ class arena { * @tparam Upstream Memory resource to use for allocating the global arena. 
Implements * rmm::mr::device_memory_resource interface. */ -template class arena_cleaner { public: - explicit arena_cleaner(std::shared_ptr> const& arena) : arena_(arena) {} + explicit arena_cleaner(std::shared_ptr const& arena) : arena_(arena) {} // Disable copy (and move) semantics. arena_cleaner(arena_cleaner const&) = delete; @@ -995,7 +994,7 @@ class arena_cleaner { private: /// A non-owning pointer to the arena that may need cleaning. - std::weak_ptr> arena_; + std::weak_ptr arena_; }; } // namespace rmm::mr::detail::arena diff --git a/include/rmm/mr/device/failure_callback_resource_adaptor.hpp b/include/rmm/mr/device/failure_callback_resource_adaptor.hpp index 53bc572c2..880517807 100644 --- a/include/rmm/mr/device/failure_callback_resource_adaptor.hpp +++ b/include/rmm/mr/device/failure_callback_resource_adaptor.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -103,12 +104,33 @@ class failure_callback_resource_adaptor final : public device_memory_resource { * @param callback Callback function @see failure_callback_t * @param callback_arg Extra argument passed to `callback` */ - failure_callback_resource_adaptor(Upstream* upstream, + failure_callback_resource_adaptor(device_async_resource_ref upstream, failure_callback_t callback, void* callback_arg) : upstream_{upstream}, callback_{std::move(callback)}, callback_arg_{callback_arg} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + } + + /** + * @brief Construct a new `failure_callback_resource_adaptor` using `upstream` to satisfy + * allocation requests. 
+ * + * @throws rmm::logic_error if `upstream == nullptr` + * + * @param upstream The resource used for allocating/deallocating device memory + * @param callback Callback function @see failure_callback_t + * @param callback_arg Extra argument passed to `callback` + */ + failure_callback_resource_adaptor(Upstream* upstream, + failure_callback_t callback, + void* callback_arg) + : upstream_{[upstream]() { + RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + return device_async_resource_ref{*upstream}; + }()}, + callback_{std::move(callback)}, + callback_arg_{callback_arg} + { } failure_callback_resource_adaptor() = delete; @@ -128,11 +150,6 @@ class failure_callback_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - private: /** * @brief Allocates memory of size at least `bytes` using the upstream @@ -151,7 +168,7 @@ class failure_callback_resource_adaptor final : public device_memory_resource { while (true) { try { - ret = upstream_->allocate(bytes, stream); + ret = get_upstream_resource().allocate_async(bytes, stream); break; } catch (exception_type const& e) { if (!callback_(bytes, callback_arg_)) { throw; } @@ -169,7 +186,7 @@ class failure_callback_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, stream); } /** @@ -183,11 +200,12 @@ class failure_callback_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } - 
Upstream* upstream_; // the upstream resource used for satisfying allocation requests + // the upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_{rmm::mr::get_current_device_resource()}; failure_callback_t callback_; void* callback_arg_; }; diff --git a/include/rmm/mr/device/limiting_resource_adaptor.hpp b/include/rmm/mr/device/limiting_resource_adaptor.hpp index 76a5a31c1..0ca74720a 100644 --- a/include/rmm/mr/device/limiting_resource_adaptor.hpp +++ b/include/rmm/mr/device/limiting_resource_adaptor.hpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -53,7 +54,7 @@ class limiting_resource_adaptor final : public device_memory_resource { * @param allocation_limit Maximum memory allowed for this allocator * @param alignment Alignment in bytes for the start of each allocated buffer */ - limiting_resource_adaptor(Upstream* upstream, + limiting_resource_adaptor(device_async_resource_ref upstream, std::size_t allocation_limit, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) : allocation_limit_{allocation_limit}, @@ -61,7 +62,29 @@ class limiting_resource_adaptor final : public device_memory_resource { alignment_(alignment), upstream_{upstream} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + } + + /** + * @brief Construct a new limiting resource adaptor using `upstream` to satisfy + * allocation requests and limiting the total allocation amount possible. 
+ * + * @throws rmm::logic_error if `upstream == nullptr` + * + * @param upstream The resource used for allocating/deallocating device memory + * @param allocation_limit Maximum memory allowed for this allocator + * @param alignment Alignment in bytes for the start of each allocated buffer + */ + limiting_resource_adaptor(Upstream* upstream, + std::size_t allocation_limit, + std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) + : allocation_limit_{allocation_limit}, + allocated_bytes_(0), + alignment_(alignment), + upstream_{[upstream]() { + RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + return device_async_resource_ref{*upstream}; + }()} + { } limiting_resource_adaptor() = delete; @@ -81,11 +104,6 @@ class limiting_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - /** * @brief Query the number of bytes that have been allocated. Note that * this can not be used to know how large of an allocation is possible due @@ -126,7 +144,7 @@ class limiting_resource_adaptor final : public device_memory_resource { auto const old = allocated_bytes_.fetch_add(proposed_size); if (old + proposed_size <= allocation_limit_) { try { - return upstream_->allocate(bytes, stream); + return get_upstream_resource().allocate_async(bytes, stream); } catch (...) 
{ allocated_bytes_ -= proposed_size; throw; @@ -147,7 +165,7 @@ class limiting_resource_adaptor final : public device_memory_resource { void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { std::size_t allocated_size = rmm::align_up(bytes, alignment_); - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, stream); allocated_bytes_ -= allocated_size; } @@ -162,7 +180,7 @@ class limiting_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto const* cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } @@ -175,8 +193,8 @@ class limiting_resource_adaptor final : public device_memory_resource { // todo: should be some way to ask the upstream... std::size_t alignment_; - Upstream* upstream_; ///< The upstream resource used for satisfying - ///< allocation requests + // The upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_{rmm::mr::get_current_device_resource()}; }; /** diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index a56a784a1..f6839f6c7 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -19,29 +19,29 @@ #include #include #include -#include +#include #include #include /** * @file per_device_resource.hpp - * @brief Management of per-device `device_memory_resource`s - * - * One might wish to construct a `device_memory_resource` and use it for (de)allocation - * without explicit dependency injection, i.e., passing a reference to that object to all places it - * is to be used. Instead, one might want to set their resource as a "default" and have it be used - * in all places where another resource has not been explicitly specified. 
In applications with - * multiple GPUs in the same process, it may also be necessary to maintain independent default - * resources for each device. To this end, the `set_per_device_resource` and - * `get_per_device_resource` functions enable mapping a CUDA device id to a `device_memory_resource` - * pointer. - * - * For example, given a pointer, `mr`, to a `device_memory_resource` object, calling + * @brief Management of per-device memory resources + * + * One might wish to construct a memory resource and use it for (de)allocation without explicit + * dependency injection, i.e., passing a reference to that object to all places it is to be used. + * Instead, one might want to set their resource as a "default" and have it be used in all places + * where another resource has not been explicitly specified. In applications with multiple GPUs in + * the same process, it may also be necessary to maintain independent default resources for each + * device. To this end, the `set_per_device_resource` and `get_per_device_resource` functions enable + * mapping a CUDA device id to a memory resource. + * + * For example, `rmm::device_async_resource_ref`, `mr`, to a memory resource object, calling * `set_per_device_resource(cuda_device_id{0}, mr)` will establish a mapping between CUDA device 0 * and `mr` such that all future calls to `get_per_device_resource(cuda_device_id{0})` will return - * the same pointer, `mr`. In this way, all places that use the resource returned from - * `get_per_device_resource` for (de)allocation will use the user provided resource, `mr`. + * the same `rmm::device_async_resource_ref`, `mr`. In this way, all places that use the resource + * returned from `get_per_device_resource` for (de)allocation will use the user provided resource, + * `mr`. * * @note `device_memory_resource`s make CUDA API calls without setting the current CUDA device. 
* Therefore a memory resource should only be used with the current CUDA device set to the device @@ -49,16 +49,16 @@ * is only valid if `id` refers to the CUDA device that was active when `mr` was created. * * If no resource was explicitly set for a given device specified by `id`, then - * `get_per_device_resource(id)` will return a pointer to a `cuda_memory_resource`. + * `get_per_device_resource(id)` will return a reference to a `cuda_memory_resource`. * * To fetch and modify the resource for the current CUDA device, `get_current_device_resource()` and * `set_current_device_resource()` will automatically use the current CUDA device id from * `cudaGetDevice()`. * - * Creating a device_memory_resource for each device requires care to set the current device - * before creating each resource, and to maintain the lifetime of the resources as long as they - * are set as per-device resources. Here is an example loop that creates `unique_ptr`s to - * pool_memory_resource objects for each device and sets them as the per-device resource for that + * Creating a memory resource for each device requires care to set the current device before + * creating each resource, and to maintain the lifetime of the resources as long as they are set as + * per-device resources. Here is an example loop that creates `unique_ptr`s to + * `pool_memory_resource` objects for each device and sets them as the per-device resource for that * device. * * @code{.cpp} @@ -102,6 +102,18 @@ inline device_memory_resource* initial_resource() return &mr; } +/** + * @brief Returns a reference to the initial resource. + * + * Returns a global instance of a `cuda_memory_resource` as a function local static. 
+ *
+ * @return Reference to the static cuda_memory_resource used as the initial, default resource
+ */
+inline device_async_resource_ref initial_resource_ref()
+{
+  return device_async_resource_ref{*initial_resource()};
+}
+
 /**
  * @briefreturn{Reference to the lock}
  */
@@ -111,13 +123,13 @@ inline std::mutex& map_lock()
 {
   return map_lock;
 }
 
 // This symbol must have default visibility, see: https://github.com/rapidsai/rmm/issues/826
 /**
  * @briefreturn{Reference to the map from device id -> resource}
  */
 RMM_EXPORT inline auto& get_map()
 {
-  static std::map device_id_to_resource;
+  static std::map device_id_to_resource;
   return device_id_to_resource;
 }
@@ -144,26 +156,28 @@ RMM_EXPORT inline auto& get_map()
  * @param device_id The id of the target device
- * @return Pointer to the current `device_memory_resource` for device `id`
+ * @return `device_async_resource_ref` for the current resource of device `id`
  */
-inline device_memory_resource* get_per_device_resource(cuda_device_id device_id)
+inline device_async_resource_ref get_per_device_resource(cuda_device_id device_id)
 {
   std::lock_guard lock{detail::map_lock()};
   auto& map = detail::get_map();
   // If a resource was never set for `id`, set to the initial resource
   auto const found = map.find(device_id.value());
-  return (found == map.end()) ?
-           (map[device_id.value()] = detail::initial_resource())
-         : found->second;
+  // Note: resource_ref is not default-constructible, so we cannot use std::map::operator[]
+  if (found == map.end()) {
+    auto item = map.insert({device_id.value(), detail::initial_resource_ref()});
+    return item.first->second;
+  }
+  return found->second;
 }
 
 /**
  * @brief Set the `device_memory_resource` for the specified device.
 *
- * If `new_mr` is not `nullptr`, sets the memory resource pointer for the device specified by `id`
- * to `new_mr`. Otherwise, resets `id`s resource to the initial `cuda_memory_resource`.
+ * Sets the memory resource pointer for the device specified by `id` to `new_mr`.
 *
 * `id.value()` must be in the range `[0, cudaGetDeviceCount())`, otherwise behavior is undefined.
 *
- * The object pointed to by `new_mr` must outlive the last use of the resource, otherwise behavior
- * is undefined. It is the caller's responsibility to maintain the lifetime of the resource
+ * The memory resource referenced by `new_mr` must outlive the last use of the resource, otherwise
+ * behavior is undefined. It is the caller's responsibility to maintain the lifetime of the resource
 * object.
 *
 * This function is thread-safe with respect to concurrent calls to `set_per_device_resource`,
 * `get_per_device_resource`, `get_current_device_resource`, and `set_current_device_resource`.
 * Concurrent calls to any of these functions will result in a valid state, but the order of
 * execution is undefined.
 *
 * @note The resource passed in `new_mr` must have been created when device `id` was the current
 * CUDA device (e.g. set using `cudaSetDevice()`). The behavior of a memory resource is
 * undefined if used while the active CUDA device is a different device from the one that was active
@@ -177,26 +191,104 @@ inline device_memory_resource* get_per_device_resource(cuda_device_id device_id)
 * when the device_memory_resource was created.
 *
 * @param device_id The id of the target device
- * @param new_mr If not `nullptr`, pointer to new `device_memory_resource` to use as new resource
- * for `id`
- * @return Pointer to the previous memory resource for `id`
+ * @param new_mr Reference to `device_memory_resource` to use as new resource for `id`
+ * @return Reference to the previous memory resource for `id`
 */
-inline device_memory_resource* set_per_device_resource(cuda_device_id device_id,
-                                                       device_memory_resource* new_mr)
+inline device_async_resource_ref set_per_device_resource(cuda_device_id device_id,
+                                                         device_async_resource_ref new_mr)
 {
   std::lock_guard lock{detail::map_lock()};
   auto& map = detail::get_map();
   auto const old_itr = map.find(device_id.value());
   // If a resource didn't previously exist for `id`, return pointer to initial_resource
-  auto* old_mr = (old_itr == map.end()) ? detail::initial_resource() : old_itr->second;
-  map[device_id.value()] = (new_mr == nullptr) ? detail::initial_resource() : new_mr;
-  return old_mr;
+  // Note: because resource_ref is not default-constructible, we can't use std::map::operator[]
+  if (old_itr == map.end()) {
+    map.insert({device_id.value(), new_mr});
+    return detail::initial_resource_ref();
+  }
+
+  auto old_resource_ref = old_itr->second;
+  old_itr->second = new_mr;  // update map directly via iterator
+  return old_resource_ref;
+}
+
+/**
+ * @brief Reset the memory resource for the specified device to the initial resource.
+ *
+ * Resets to a reference to the initial `cuda_memory_resource`.
+ *
+ * `device_id.value()` must be in the range `[0, cudaGetDeviceCount())`, otherwise behavior is
+ * undefined.
+ *
+ * This function is thread-safe with respect to concurrent calls to `set_per_device_resource`,
+ * `get_per_device_resource`, `get_current_device_resource`, `set_current_device_resource`,
+ * `reset_per_device_resource` and `reset_current_device_resource`. Concurrent calls to any of these
+ * functions will result in a valid state, but the order of execution is undefined.
+ *
+ * @param device_id The id of the target device
+ * @return Previous `device_async_resource_ref` for `device_id`
+ */
+inline device_async_resource_ref reset_per_device_resource(cuda_device_id device_id)
+{
+  return set_per_device_resource(device_id, detail::initial_resource_ref());
+}
+
+/**
+ * @brief Reset the memory resource for the current device to the initial resource.
+ *
+ * Resets to a reference to the initial `cuda_memory_resource`. The "current device" is the device
+ * returned by `cudaGetDevice`.
+ *
+ * This function is thread-safe with respect to concurrent calls to `set_per_device_resource`,
+ * `get_per_device_resource`, `get_current_device_resource`, `set_current_device_resource`,
+ * `reset_per_device_resource` and `reset_current_device_resource`. Concurrent calls to any of these
+ * functions will result in a valid state, but the order of execution is undefined.
+ *
+ * @return Previous `device_async_resource_ref` for the current device
+ */
+inline device_async_resource_ref reset_current_device_resource()
+{
+  return reset_per_device_resource(rmm::get_current_cuda_device());
+}
+
+/**
+ * @brief Set the `device_memory_resource` for the specified device.
+ *
+ * If `new_mr` is not `nullptr`, sets the memory resource for the device specified by `id`
+ * to `new_mr`. Otherwise, resets `id`s resource to the initial `cuda_memory_resource`.
+ * + * `id.value()` must be in the range `[0, cudaGetDeviceCount())`, otherwise behavior is undefined. + * + * The object pointed to by `new_mr` must outlive the last use of the resource, otherwise behavior + * is undefined. It is the caller's responsibility to maintain the lifetime of the resource + * object. + * + * This function is thread-safe with respect to concurrent calls to `set_per_device_resource`, + * `get_per_device_resource`, `get_current_device_resource`, and `set_current_device_resource`. + * Concurrent calls to any of these functions will result in a valid state, but the order of + * execution is undefined. + * + * @note The resource passed in `new_mr` must have been created when device `id` was the current + * CUDA device (e.g. set using `cudaSetDevice()`). The behavior of a memory resource is + * undefined if used while the active CUDA device is a different device from the one that was active + * when the memory resource was created. + * + * @param device_id The id of the target device + * @param new_mr If not `nullptr`, pointer to new `device_memory_resource` to use as new resource + * for `id` + * @return Pointer to the previous memory resource for `id` + */ +inline device_async_resource_ref set_per_device_resource(cuda_device_id device_id, + device_memory_resource* new_mr) +{ + if (new_mr == nullptr) { return reset_per_device_resource(device_id); } + return set_per_device_resource(device_id, *new_mr); } /** * @brief Get the memory resource for the current device. * - * Returns a pointer to the resource set for the current device. The initial resource is a + * Returns a reference to the resource set for the current device. The initial resource is a * `cuda_memory_resource`. * * The "current device" is the device returned by `cudaGetDevice`. 
@@ -206,19 +323,47 @@ inline device_memory_resource* set_per_device_resource(cuda_device_id device_id, * Concurrent calls to any of these functions will result in a valid state, but the order of * execution is undefined. * - * @note The returned `device_memory_resource` should only be used with the current CUDA device. + * @note The returned memory resource should only be used with the current CUDA device. * Changing the current device (e.g. using `cudaSetDevice()`) and then using the returned resource - * can result in undefined behavior. The behavior of a device_memory_resource is undefined if used + * can result in undefined behavior. The behavior of a memory resource is undefined if used * while the active CUDA device is a different device from the one that was active when the - * device_memory_resource was created. + * memory resource was created. * * @return Pointer to the resource for the current device */ -inline device_memory_resource* get_current_device_resource() +inline device_async_resource_ref get_current_device_resource() { return get_per_device_resource(rmm::get_current_cuda_device()); } +/** + * @brief Set the memory resource for the current device. + * + * Sets the memory resource for the current device to `new_mr`. + * + * The "current device" is the device returned by `cudaGetDevice`. + * + * The object pointed to by `new_mr` must outlive the last use of the resource, otherwise behavior + * is undefined. It is the caller's responsibility to maintain the lifetime of the resource + * object. + * + * This function is thread-safe with respect to concurrent calls to `set_per_device_resource`, + * `get_per_device_resource`, `get_current_device_resource`, and `set_current_device_resource`. + * Concurrent calls to any of these functions will result in a valid state, but the order of + * execution is undefined. + * + * @note The resource passed in `new_mr` must have been created for the current CUDA device. 
The + * behavior of a memory resource is undefined if used while the active CUDA device is a + * different device from the one that was active when the memory resource was created. + * + * @param new_mr New resource reference to use for the current device + * @return Previous resource reference for the current device + */ +inline device_async_resource_ref set_current_device_resource(device_async_resource_ref new_mr) +{ + return set_per_device_resource(rmm::get_current_cuda_device(), new_mr); +} + /** * @brief Set the memory resource for the current device. * @@ -237,15 +382,16 @@ inline device_memory_resource* get_current_device_resource() * execution is undefined. * * @note The resource passed in `new_mr` must have been created for the current CUDA device. The - * behavior of a device_memory_resource is undefined if used while the active CUDA device is a - * different device from the one that was active when the device_memory_resource was created. + * behavior of a memory resource is undefined if used while the active CUDA device is a + * different device from the one that was active when the memory resource was created. 
 * * @param new_mr If not `nullptr`, pointer to new resource to use for the current device * @return Pointer to the previous resource for the current device */ -inline device_memory_resource* set_current_device_resource(device_memory_resource* new_mr) { - return set_per_device_resource(rmm::get_current_cuda_device(), new_mr); + return set_per_device_resource(rmm::get_current_cuda_device(), *new_mr); } + /** @} */ // end of group } // namespace rmm::mr diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp index a3a972904..5e76aaf74 100644 --- a/include/rmm/mr/device/pool_memory_resource.hpp +++ b/include/rmm/mr/device/pool_memory_resource.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -112,6 +113,34 @@ class pool_memory_resource final friend class detail::stream_ordered_memory_resource, detail::coalescing_free_list>; + /** + * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using + * `upstream_mr`. + * + * @throws rmm::logic_error if `upstream_mr == nullptr` + * @throws rmm::logic_error if `initial_pool_size` is not aligned to a multiple of + * pool_memory_resource::allocation_alignment bytes. + * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a + * multiple of pool_memory_resource::allocation_alignment bytes. + * + * @param upstream_mr The memory_resource from which to allocate blocks for the pool. + * @param initial_pool_size Minimum size, in bytes, of the initial pool. + * @param maximum_pool_size Maximum size, in bytes, that the pool can grow to. Defaults to all + * of the available memory from the upstream resource. 
+ */ + explicit pool_memory_resource(device_async_resource_ref upstream_mr, + std::size_t initial_pool_size, + std::optional maximum_pool_size = std::nullopt) + : upstream_mr_{upstream_mr} + { + RMM_EXPECTS(rmm::is_aligned(initial_pool_size, rmm::CUDA_ALLOCATION_ALIGNMENT), + "Error, Initial pool size required to be a multiple of 256 bytes"); + RMM_EXPECTS(rmm::is_aligned(maximum_pool_size.value_or(0), rmm::CUDA_ALLOCATION_ALIGNMENT), + "Error, Maximum pool size required to be a multiple of 256 bytes"); + + initialize_pool(initial_pool_size, maximum_pool_size); + } + /** * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using * `upstream_mr`. @@ -132,7 +161,7 @@ class pool_memory_resource final std::optional maximum_pool_size = std::nullopt) : upstream_mr_{[upstream_mr]() { RMM_EXPECTS(nullptr != upstream_mr, "Unexpected null upstream pointer."); - return upstream_mr; + return device_async_resource_ref{*upstream_mr}; }()} { RMM_EXPECTS(rmm::is_aligned(initial_pool_size, rmm::CUDA_ALLOCATION_ALIGNMENT), @@ -182,16 +211,11 @@ class pool_memory_resource final /** * @briefreturn{rmm::device_async_resource_ref to the upstream resource} */ - [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept + [[nodiscard]] device_async_resource_ref get_upstream_resource() const noexcept { return upstream_mr_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_mr_; } - /** * @brief Computes the size of the current pool * @@ -464,7 +488,8 @@ class pool_memory_resource final } private: - Upstream* upstream_mr_; // The "heap" to allocate the pool from + // The "heap" to allocate the pool from + device_async_resource_ref upstream_mr_; std::size_t current_pool_size_{}; std::optional maximum_pool_size_{}; diff --git a/include/rmm/mr/device/statistics_resource_adaptor.hpp b/include/rmm/mr/device/statistics_resource_adaptor.hpp 
index cbf1b87d2..ed0c5a7dc 100644 --- a/include/rmm/mr/device/statistics_resource_adaptor.hpp +++ b/include/rmm/mr/device/statistics_resource_adaptor.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -111,6 +112,16 @@ class statistics_resource_adaptor final : public device_memory_resource { } }; + /** + * @brief Construct a new statistics resource adaptor using `upstream` to satisfy + * allocation requests. + * + * @throws rmm::logic_error if `upstream == nullptr` + * + * @param upstream Reference to the resource used for allocating/deallocating device memory + */ + statistics_resource_adaptor(device_async_resource_ref upstream) : upstream_{upstream} {} + /** * @brief Construct a new statistics resource adaptor using `upstream` to satisfy * allocation requests. @@ -119,9 +130,12 @@ class statistics_resource_adaptor final : public device_memory_resource { * * @param upstream The resource used for allocating/deallocating device memory */ - statistics_resource_adaptor(Upstream* upstream) : upstream_{upstream} + statistics_resource_adaptor(Upstream* upstream) + : upstream_{[upstream]() { + RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + return device_async_resource_ref{*upstream}; + }()} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); } statistics_resource_adaptor() = delete; @@ -141,11 +155,6 @@ class statistics_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - /** * @brief Returns a `counter` struct for this adaptor containing the current, * peak, and total number of allocated bytes for this @@ -224,7 +233,7 @@ class statistics_resource_adaptor final : public device_memory_resource { */ void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { - void* ptr = upstream_->allocate(bytes, 
stream); + void* ptr = upstream_.allocate_async(bytes, stream); // increment the stats { @@ -247,7 +256,7 @@ class statistics_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - upstream_->deallocate(ptr, bytes, stream); + upstream_.deallocate_async(ptr, bytes, stream); { write_lock_t lock(mtx_); @@ -269,7 +278,7 @@ class statistics_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } @@ -277,14 +286,14 @@ class statistics_resource_adaptor final : public device_memory_resource { // Invariant: the stack always contains at least one entry std::stack> counter_stack_{{std::make_pair(counter{}, counter{})}}; std::shared_mutex mutable mtx_; // mutex for thread safe access to allocations_ - Upstream* upstream_; // the upstream resource used for satisfying allocation requests + // the upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_{rmm::mr::get_current_device_resource()}; }; /** * @brief Convenience factory to return a `statistics_resource_adaptor` around the * upstream resource `upstream`. * - * @tparam Upstream Type of the upstream `device_memory_resource`. 
* @param upstream Pointer to the upstream resource * @return The new statistics resource adaptor */ diff --git a/include/rmm/mr/device/tracking_resource_adaptor.hpp b/include/rmm/mr/device/tracking_resource_adaptor.hpp index 0d3046973..31fc46d32 100644 --- a/include/rmm/mr/device/tracking_resource_adaptor.hpp +++ b/include/rmm/mr/device/tracking_resource_adaptor.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -90,10 +91,26 @@ class tracking_resource_adaptor final : public device_memory_resource { * @param upstream The resource used for allocating/deallocating device memory * @param capture_stacks If true, capture stacks for allocation calls */ - tracking_resource_adaptor(Upstream* upstream, bool capture_stacks = false) + tracking_resource_adaptor(device_async_resource_ref upstream, bool capture_stacks = false) : capture_stacks_{capture_stacks}, allocated_bytes_{0}, upstream_{upstream} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + } + + /** + * @brief Construct a new tracking resource adaptor using `upstream` to satisfy + * allocation requests. 
+ * + * @throws rmm::logic_error if `upstream == nullptr` + * + * @param upstream The resource used for allocating/deallocating device memory + * @param capture_stacks If true, capture stacks for allocation calls + */ + tracking_resource_adaptor(Upstream* upstream, bool capture_stacks = false) + : capture_stacks_{capture_stacks}, allocated_bytes_{0}, upstream_{[upstream]() { + RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + return device_async_resource_ref{*upstream}; + }()} + { } tracking_resource_adaptor() = delete; @@ -113,11 +130,6 @@ class tracking_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - /** * @brief Get the outstanding allocations map * @@ -197,8 +209,7 @@ class tracking_resource_adaptor final : public device_memory_resource { */ void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { - void* ptr = upstream_->allocate(bytes, stream); - + void* ptr = upstream_.allocate_async(bytes, stream); // track it. 
{ write_lock_t lock(mtx_); @@ -218,7 +229,7 @@ class tracking_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - upstream_->deallocate(ptr, bytes, stream); + upstream_.deallocate_async(ptr, bytes, stream); { write_lock_t lock(mtx_); @@ -263,7 +274,7 @@ class tracking_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } @@ -271,7 +282,9 @@ class tracking_resource_adaptor final : public device_memory_resource { std::map allocations_; // map of active allocations std::atomic allocated_bytes_; // number of bytes currently allocated std::shared_mutex mutable mtx_; // mutex for thread safe access to allocations_ - Upstream* upstream_; // the upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_{ + rmm::mr::get_current_device_resource()}; // the upstream resource used for satisfying + // allocation requests }; /** diff --git a/python/rmm/rmm/_cuda/stream.pxd b/python/rmm/rmm/_cuda/stream.pxd index 3c3d3aa6f..66107c4d8 100644 --- a/python/rmm/rmm/_cuda/stream.pxd +++ b/python/rmm/rmm/_cuda/stream.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ from libc.stdint cimport uintptr_t from libcpp cimport bool from rmm._lib.cuda_stream_view cimport cuda_stream_view +from rmm._lib.resource_ref cimport stream_ref cdef class Stream: @@ -27,6 +28,7 @@ cdef class Stream: cdef Stream _from_cudaStream_t(cudaStream_t s, object owner=*) cdef cuda_stream_view view(self) except * nogil + cdef stream_ref ref(self) except * nogil cdef void c_synchronize(self) except * nogil cdef bool c_is_default(self) except * nogil cdef void _init_with_new_cuda_stream(self) except * diff --git a/python/rmm/rmm/_cuda/stream.pyx b/python/rmm/rmm/_cuda/stream.pyx index 4d5ff5232..19d91d624 100644 --- a/python/rmm/rmm/_cuda/stream.pyx +++ b/python/rmm/rmm/_cuda/stream.pyx @@ -23,6 +23,7 @@ from rmm._lib.cuda_stream_view cimport ( cuda_stream_per_thread, cuda_stream_view, ) +from rmm._lib.resource_ref cimport stream_ref cdef class Stream: @@ -63,6 +64,12 @@ cdef class Stream: """ return cuda_stream_view((self._cuda_stream)) + cdef stream_ref ref(self) except * nogil: + """ + Generate a cuda::stream_ref from this Stream instance + """ + return stream_ref((self._cuda_stream)) + cdef void c_synchronize(self) except * nogil: """ Synchronize the CUDA stream. diff --git a/python/rmm/rmm/_lib/_torch_allocator.cpp b/python/rmm/rmm/_lib/_torch_allocator.cpp index dc92e4639..ab36c2150 100644 --- a/python/rmm/rmm/_lib/_torch_allocator.cpp +++ b/python/rmm/rmm/_lib/_torch_allocator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -40,7 +40,8 @@ extern "C" void* allocate(std::size_t size, int device, void* stream) rmm::cuda_device_id const device_id{device}; rmm::cuda_set_device_raii with_device{device_id}; auto mr = rmm::mr::get_per_device_resource(device_id); - return mr->allocate(size, rmm::cuda_stream_view{static_cast(stream)}); + return mr.allocate_async( + size, rmm::CUDA_ALLOCATION_ALIGNMENT, rmm::cuda_stream_view{static_cast(stream)}); } /** @@ -56,5 +57,8 @@ extern "C" void deallocate(void* ptr, std::size_t size, int device, void* stream rmm::cuda_device_id const device_id{device}; rmm::cuda_set_device_raii with_device{device_id}; auto mr = rmm::mr::get_per_device_resource(device_id); - mr->deallocate(ptr, size, rmm::cuda_stream_view{static_cast(stream)}); + mr.deallocate_async(ptr, + size, + rmm::CUDA_ALLOCATION_ALIGNMENT, + rmm::cuda_stream_view{static_cast(stream)}); } diff --git a/python/rmm/rmm/_lib/extern_memory_resources.pxd b/python/rmm/rmm/_lib/extern_memory_resources.pxd new file mode 100644 index 000000000..fb6df19c0 --- /dev/null +++ b/python/rmm/rmm/_lib/extern_memory_resources.pxd @@ -0,0 +1,192 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from libc.stdint cimport int8_t, int64_t +from libcpp cimport bool +from libcpp.optional cimport optional + + +cdef extern from "rmm/mr/device/callback_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + ctypedef void* (*allocate_callback_t)(size_t, cuda_stream_view, void*) + ctypedef void (*deallocate_callback_t)(void*, size_t, cuda_stream_view, void*) + +cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + ctypedef bool (*failure_callback_t)(size_t, void*) + +# TODO: when we adopt Cython 3.0 use enum class +cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \ + namespace \ + "rmm::mr::cuda_async_memory_resource::allocation_handle_type" \ + nogil: + enum allocation_handle_type \ + "rmm::mr::cuda_async_memory_resource::allocation_handle_type": + none + posix_file_descriptor + win32 + win32_kmt + +cdef extern from "rmm/mr/device/device_memory_resource.hpp" namespace "rmm::mr" nogil: + cdef cppclass device_memory_resource: + + device_memory_resource() except + + +cdef extern from "rmm/mr/device/cuda_memory_resource.hpp" namespace "rmm::mr" nogil: + cdef cppclass cuda_memory_resource(device_memory_resource): + + cuda_memory_resource() except + + +cdef extern from "rmm/mr/device/managed_memory_resource.hpp" namespace "rmm::mr" nogil: + cdef cppclass managed_memory_resource(device_memory_resource): + + managed_memory_resource() except + + +cdef extern from "rmm/mr/device/system_memory_resource.hpp" namespace "rmm::mr" nogil: + cdef cppclass system_memory_resource(device_memory_resource): + + system_memory_resource() except + + +cdef extern from "rmm/mr/device/sam_headroom_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass sam_headroom_memory_resource(device_memory_resource): + + sam_headroom_memory_resource(size_t headroom) except + + +cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass 
cuda_async_memory_resource(device_memory_resource): + + cuda_async_memory_resource( + optional[size_t] initial_pool_size, + optional[size_t] release_threshold, + optional[allocation_handle_type] export_handle_type + ) except + + +cdef extern from "rmm/mr/device/pool_memory_resource.hpp" namespace "rmm::mr" nogil: + cdef cppclass pool_memory_resource[Upstream](device_memory_resource): + + size_t pool_size() + + pool_memory_resource( + device_async_resource_ref upstream_mr, + size_t initial_pool_size, + optional[size_t] maximum_pool_size + ) except + + +cdef extern from "rmm/mr/device/fixed_size_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass fixed_size_memory_resource[Upstream](device_memory_resource): + + fixed_size_memory_resource( + Upstream* upstream_mr, + size_t block_size, + size_t block_to_preallocate + ) except + + +cdef extern from "rmm/mr/device/binning_memory_resource.hpp" namespace "rmm::mr" nogil: + cdef cppclass binning_memory_resource[Upstream](device_memory_resource): + + binning_memory_resource( + Upstream* upstream_mr, + int8_t min_size_exponent, + int8_t max_size_exponent) except + + + void add_bin(size_t allocation_size) except + + void add_bin( + size_t allocation_size, + device_memory_resource* bin_resource) except + + + binning_memory_resource(Upstream* upstream_mr) except + + +cdef extern from "rmm/mr/device/callback_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass callback_memory_resource(device_memory_resource): + + callback_memory_resource( + allocate_callback_t allocate_callback, + deallocate_callback_t deallocate_callback, + void* allocate_callback_arg, + void* deallocate_callback_arg + ) except + + +cdef extern from "rmm/mr/device/limiting_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass limiting_resource_adaptor[Upstream](device_memory_resource): + + size_t get_allocated_bytes() except + + size_t get_allocation_limit() except + + + limiting_resource_adaptor( + Upstream* 
upstream_mr, + size_t allocation_limit + ) except + + +cdef extern from "rmm/mr/device/logging_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass logging_resource_adaptor[Upstream](device_memory_resource): + + void flush() except + + + logging_resource_adaptor( + Upstream* upstream_mr, + string filename + ) except + + +cdef extern from "rmm/mr/device/statistics_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass statistics_resource_adaptor[Upstream](device_memory_resource): + + struct counter: + counter() + + int64_t value + int64_t peak + int64_t total + + counter get_bytes_counter() except + + counter get_allocations_counter() except + + pair[counter, counter] pop_counters() except + + pair[counter, counter] push_counters() except + + + statistics_resource_adaptor(Upstream* upstream_mr) except + + +cdef extern from "rmm/mr/device/tracking_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass tracking_resource_adaptor[Upstream](device_memory_resource): + + size_t get_allocated_bytes() except + + string get_outstanding_allocations_str() except + + void log_outstanding_allocations() except + + + tracking_resource_adaptor( + Upstream* upstream_mr, + bool capture_stacks + ) except + + +cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass failure_callback_resource_adaptor[Upstream](device_memory_resource): + + failure_callback_resource_adaptor( + Upstream* upstream_mr, + failure_callback_t callback, + void* callback_arg + ) except + + +cdef extern from "rmm/mr/device/prefetch_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass prefetch_resource_adaptor[Upstream](device_memory_resource): + + prefetch_resource_adaptor(Upstream* upstream_mr) except + diff --git a/python/rmm/rmm/_lib/memory_resource.pxd b/python/rmm/rmm/_lib/memory_resource.pxd index 000a3fe1e..ecaa36edd 100644 --- a/python/rmm/rmm/_lib/memory_resource.pxd +++ 
b/python/rmm/rmm/_lib/memory_resource.pxd @@ -13,61 +13,106 @@ # limitations under the License. from libc.stdint cimport int8_t -from libcpp.memory cimport shared_ptr +from libcpp.memory cimport make_shared, shared_ptr from libcpp.pair cimport pair from libcpp.string cimport string from libcpp.vector cimport vector from rmm._lib.cuda_stream_view cimport cuda_stream_view +from rmm._lib.resource_ref cimport ( + CUDA_ALLOCATION_ALIGNMENT, + device_async_resource_ref, + stream_ref, +) +include "extern_memory_resources.pxd" + +cdef extern from *: + """ + template + rmm::device_async_resource_ref as_ref(T *p) { return p; } + """ + + device_async_resource_ref as_ref[T](T *p) noexcept nogil -cdef extern from "rmm/mr/device/device_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass device_memory_resource: - void* allocate(size_t bytes) except + - void* allocate(size_t bytes, cuda_stream_view stream) except + - void deallocate(void* ptr, size_t bytes) except + - void deallocate( - void* ptr, - size_t bytes, - cuda_stream_view stream - ) except + cdef extern from "rmm/cuda_device.hpp" namespace "rmm" nogil: size_t percent_of_free_device_memory(int percent) except + pair[size_t, size_t] available_device_memory() except + + cdef class DeviceMemoryResource: - cdef shared_ptr[device_memory_resource] c_obj cdef device_memory_resource* get_mr(self) noexcept nogil + cdef shared_ptr[device_async_resource_ref] get_ref(self) noexcept nogil + cdef class UpstreamResourceAdaptor(DeviceMemoryResource): cdef readonly DeviceMemoryResource upstream_mr - cpdef DeviceMemoryResource get_upstream(self) + cdef class CudaMemoryResource(DeviceMemoryResource): - pass + cdef shared_ptr[cuda_memory_resource] c_obj + cdef inline device_memory_resource* get_mr(self) noexcept nogil: + return self.c_obj.get() + cdef inline shared_ptr[device_async_resource_ref] get_ref(self) noexcept nogil: + return make_shared[device_async_resource_ref](as_ref(self.c_obj.get())) + cdef class 
ManagedMemoryResource(DeviceMemoryResource): - pass + cdef shared_ptr[managed_memory_resource] c_obj + cdef inline device_memory_resource* get_mr(self) noexcept nogil: + return self.c_obj.get() + cdef inline shared_ptr[device_async_resource_ref] get_ref(self) noexcept nogil: + return make_shared[device_async_resource_ref](as_ref(self.c_obj.get())) + cdef class SystemMemoryResource(DeviceMemoryResource): - pass + cdef shared_ptr[system_memory_resource] c_obj + cdef inline device_memory_resource* get_mr(self) noexcept nogil: + return self.c_obj.get() + cdef inline shared_ptr[device_async_resource_ref] get_ref(self) noexcept nogil: + return make_shared[device_async_resource_ref](as_ref(self.c_obj.get())) + cdef class SamHeadroomMemoryResource(DeviceMemoryResource): - pass + cdef shared_ptr[sam_headroom_memory_resource] c_obj + cdef inline device_memory_resource* get_mr(self) noexcept nogil: + return self.c_obj.get() + cdef inline shared_ptr[device_async_resource_ref] get_ref(self) noexcept nogil: + return make_shared[device_async_resource_ref](as_ref(self.c_obj.get())) + cdef class CudaAsyncMemoryResource(DeviceMemoryResource): - pass + cdef shared_ptr[cuda_async_memory_resource] c_obj + cdef inline device_memory_resource* get_mr(self) noexcept nogil: + return self.c_obj.get() + cdef inline shared_ptr[device_async_resource_ref] get_ref(self) noexcept nogil: + return make_shared[device_async_resource_ref](as_ref(self.c_obj.get())) + cdef class PoolMemoryResource(UpstreamResourceAdaptor): - pass + cdef shared_ptr[pool_memory_resource[device_memory_resource]] c_obj + cdef inline device_memory_resource* get_mr(self) noexcept nogil: + return self.c_obj.get() + cdef inline shared_ptr[device_async_resource_ref] get_ref(self) noexcept nogil: + return make_shared[device_async_resource_ref](as_ref(self.c_obj.get())) + cdef class FixedSizeMemoryResource(UpstreamResourceAdaptor): - pass + cdef shared_ptr[fixed_size_memory_resource[device_memory_resource]] c_obj + cdef inline 
device_memory_resource* get_mr(self) noexcept nogil: + return self.c_obj.get() + cdef inline shared_ptr[device_async_resource_ref] get_ref(self) noexcept nogil: + return make_shared[device_async_resource_ref](as_ref(self.c_obj.get())) + cdef class BinningMemoryResource(UpstreamResourceAdaptor): + cdef shared_ptr[binning_memory_resource[device_memory_resource]] c_obj + cdef inline device_memory_resource* get_mr(self) noexcept nogil: + return self.c_obj.get() + cdef inline shared_ptr[device_async_resource_ref] get_ref(self) noexcept nogil: + return make_shared[device_async_resource_ref](as_ref(self.c_obj.get())) cdef readonly list _bin_mrs @@ -77,27 +122,66 @@ cdef class BinningMemoryResource(UpstreamResourceAdaptor): DeviceMemoryResource bin_resource=*) cdef class CallbackMemoryResource(DeviceMemoryResource): + cdef shared_ptr[callback_memory_resource] c_obj + cdef inline device_memory_resource* get_mr(self) noexcept nogil: + return self.c_obj.get() + cdef inline shared_ptr[device_async_resource_ref] get_ref(self) noexcept nogil: + return make_shared[device_async_resource_ref](as_ref(self.c_obj.get())) + cdef object _allocate_func cdef object _deallocate_func cdef class LimitingResourceAdaptor(UpstreamResourceAdaptor): - pass + cdef shared_ptr[limiting_resource_adaptor[device_memory_resource]] c_obj + cdef inline device_memory_resource* get_mr(self) noexcept nogil: + return self.c_obj.get() + cdef inline shared_ptr[device_async_resource_ref] get_ref(self) noexcept nogil: + return make_shared[device_async_resource_ref](as_ref(self.c_obj.get())) + cdef class LoggingResourceAdaptor(UpstreamResourceAdaptor): + cdef shared_ptr[logging_resource_adaptor[device_memory_resource]] c_obj + cdef inline device_memory_resource* get_mr(self) noexcept nogil: + return self.c_obj.get() + cdef inline shared_ptr[device_async_resource_ref] get_ref(self) noexcept nogil: + return make_shared[device_async_resource_ref](as_ref(self.c_obj.get())) + cdef object _log_file_name cpdef 
get_file_name(self) cpdef flush(self) cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor): - pass + cdef shared_ptr[statistics_resource_adaptor[device_memory_resource]] c_obj + cdef inline device_memory_resource* get_mr(self) noexcept nogil: + return self.c_obj.get() + cdef inline shared_ptr[device_async_resource_ref] get_ref(self) noexcept nogil: + return make_shared[device_async_resource_ref](as_ref(self.c_obj.get())) + cdef class TrackingResourceAdaptor(UpstreamResourceAdaptor): - pass + cdef shared_ptr[tracking_resource_adaptor[device_memory_resource]] c_obj + cdef inline device_memory_resource* get_mr(self) noexcept nogil: + return self.c_obj.get() + cdef inline shared_ptr[device_async_resource_ref] get_ref(self) noexcept nogil: + return make_shared[device_async_resource_ref](as_ref(self.c_obj.get())) + cdef class FailureCallbackResourceAdaptor(UpstreamResourceAdaptor): + cdef shared_ptr[failure_callback_resource_adaptor[device_memory_resource]] c_obj + cdef inline device_memory_resource* get_mr(self) noexcept nogil: + return self.c_obj.get() + cdef inline shared_ptr[device_async_resource_ref] get_ref(self) noexcept nogil: + return make_shared[device_async_resource_ref](as_ref(self.c_obj.get())) + cdef object _callback + cdef class PrefetchResourceAdaptor(UpstreamResourceAdaptor): - pass + cdef shared_ptr[prefetch_resource_adaptor[device_memory_resource]] c_obj + cdef inline device_memory_resource* get_mr(self) noexcept nogil: + return self.c_obj.get() + cdef inline shared_ptr[device_async_resource_ref] get_ref(self) noexcept nogil: + return make_shared[device_async_resource_ref](as_ref(self.c_obj.get())) + cpdef DeviceMemoryResource get_current_device_resource() diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx index 5030c5d2d..602c85d20 100644 --- a/python/rmm/rmm/_lib/memory_resource.pyx +++ b/python/rmm/rmm/_lib/memory_resource.pyx @@ -22,9 +22,8 @@ from collections import defaultdict cimport cython 
from cython.operator cimport dereference as deref from libc.stddef cimport size_t -from libc.stdint cimport int8_t, int64_t, uintptr_t +from libc.stdint cimport int8_t, uintptr_t from libcpp cimport bool -from libcpp.memory cimport make_unique, unique_ptr from libcpp.optional cimport optional from libcpp.pair cimport pair from libcpp.string cimport string @@ -32,7 +31,9 @@ from libcpp.string cimport string from cuda.cudart import cudaError_t from rmm._cuda.gpu import CUDARuntimeError, getDevice, setDevice + from rmm._cuda.stream cimport Stream + from rmm._cuda.stream import DEFAULT_STREAM from rmm._lib.cuda_stream_view cimport cuda_stream_view @@ -44,6 +45,7 @@ from rmm._lib.per_device_resource cimport ( cuda_device_id, set_per_device_resource as cpp_set_per_device_resource, ) + from rmm.statistics import Statistics # Transparent handle of a C++ exception @@ -82,164 +84,16 @@ cdef extern from *: void throw_cpp_except(CppExcept) nogil -# NOTE: Keep extern declarations in .pyx file as much as possible to avoid -# leaking dependencies when importing RMM Cython .pxd files -cdef extern from "rmm/mr/device/cuda_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass cuda_memory_resource(device_memory_resource): - cuda_memory_resource() except + - -cdef extern from "rmm/mr/device/managed_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass managed_memory_resource(device_memory_resource): - managed_memory_resource() except + - -cdef extern from "rmm/mr/device/system_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass system_memory_resource(device_memory_resource): - system_memory_resource() except + - -cdef extern from "rmm/mr/device/sam_headroom_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass sam_headroom_memory_resource(device_memory_resource): - sam_headroom_memory_resource(size_t headroom) except + - -cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - - cdef 
cppclass cuda_async_memory_resource(device_memory_resource): - cuda_async_memory_resource( - optional[size_t] initial_pool_size, - optional[size_t] release_threshold, - optional[allocation_handle_type] export_handle_type) except + - -# TODO: when we adopt Cython 3.0 use enum class -cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \ - namespace \ - "rmm::mr::cuda_async_memory_resource::allocation_handle_type" \ - nogil: - enum allocation_handle_type \ - "rmm::mr::cuda_async_memory_resource::allocation_handle_type": - none - posix_file_descriptor - win32 - win32_kmt - - -cdef extern from "rmm/mr/device/pool_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass pool_memory_resource[Upstream](device_memory_resource): - pool_memory_resource( - Upstream* upstream_mr, - size_t initial_pool_size, - optional[size_t] maximum_pool_size) except + - size_t pool_size() - -cdef extern from "rmm/mr/device/fixed_size_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass fixed_size_memory_resource[Upstream](device_memory_resource): - fixed_size_memory_resource( - Upstream* upstream_mr, - size_t block_size, - size_t block_to_preallocate) except + - -cdef extern from "rmm/mr/device/callback_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - ctypedef void* (*allocate_callback_t)(size_t, cuda_stream_view, void*) - ctypedef void (*deallocate_callback_t)(void*, size_t, cuda_stream_view, void*) - - cdef cppclass callback_memory_resource(device_memory_resource): - callback_memory_resource( - allocate_callback_t allocate_callback, - deallocate_callback_t deallocate_callback, - void* allocate_callback_arg, - void* deallocate_callback_arg - ) except + - -cdef extern from "rmm/mr/device/binning_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass binning_memory_resource[Upstream](device_memory_resource): - binning_memory_resource(Upstream* upstream_mr) except + - binning_memory_resource( - Upstream* upstream_mr, - int8_t 
min_size_exponent, - int8_t max_size_exponent) except + - - void add_bin(size_t allocation_size) except + - void add_bin( - size_t allocation_size, - device_memory_resource* bin_resource) except + - -cdef extern from "rmm/mr/device/limiting_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass limiting_resource_adaptor[Upstream](device_memory_resource): - limiting_resource_adaptor( - Upstream* upstream_mr, - size_t allocation_limit) except + - - size_t get_allocated_bytes() except + - size_t get_allocation_limit() except + - -cdef extern from "rmm/mr/device/logging_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass logging_resource_adaptor[Upstream](device_memory_resource): - logging_resource_adaptor( - Upstream* upstream_mr, - string filename) except + - - void flush() except + - -cdef extern from "rmm/mr/device/statistics_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass statistics_resource_adaptor[Upstream](device_memory_resource): - struct counter: - counter() - - int64_t value - int64_t peak - int64_t total - - statistics_resource_adaptor(Upstream* upstream_mr) except + - - counter get_bytes_counter() except + - counter get_allocations_counter() except + - pair[counter, counter] pop_counters() except + - pair[counter, counter] push_counters() except + - -cdef extern from "rmm/mr/device/tracking_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass tracking_resource_adaptor[Upstream](device_memory_resource): - tracking_resource_adaptor( - Upstream* upstream_mr, - bool capture_stacks) except + - - size_t get_allocated_bytes() except + - string get_outstanding_allocations_str() except + - void log_outstanding_allocations() except + - -cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - ctypedef bool (*failure_callback_t)(size_t, void*) - cdef cppclass failure_callback_resource_adaptor[Upstream]( - device_memory_resource - ): - 
failure_callback_resource_adaptor( - Upstream* upstream_mr, - failure_callback_t callback, - void* callback_arg - ) except + - -cdef extern from "rmm/mr/device/prefetch_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass prefetch_resource_adaptor[Upstream](device_memory_resource): - prefetch_resource_adaptor(Upstream* upstream_mr) except + - - cdef class DeviceMemoryResource: + # TODO this will be removed once the C++ base class is removed + # it is only used for passing upstream resources to C++, which will + # become resource_ref (type erased) cdef device_memory_resource* get_mr(self) noexcept nogil: - """Get the underlying C++ memory resource object.""" - return self.c_obj.get() + pass + + cdef shared_ptr[device_async_resource_ref] get_ref(self) noexcept nogil: + pass def allocate(self, size_t nbytes, Stream stream=DEFAULT_STREAM): """Allocate ``nbytes`` bytes of memory. @@ -251,7 +105,11 @@ cdef class DeviceMemoryResource: stream : Stream Optional stream for the allocation """ - return self.c_obj.get().allocate(nbytes, stream.view()) + return deref(self.get_ref()).allocate_async( + nbytes, + CUDA_ALLOCATION_ALIGNMENT, + stream.ref() + ) def deallocate(self, uintptr_t ptr, size_t nbytes, Stream stream=DEFAULT_STREAM): """Deallocate memory pointed to by ``ptr`` of size ``nbytes``. @@ -265,7 +123,12 @@ cdef class DeviceMemoryResource: stream : Stream Optional stream for the deallocation """ - self.c_obj.get().deallocate((ptr), nbytes, stream.view()) + deref(self.get_ref()).deallocate_async( + (ptr), + nbytes, + CUDA_ALLOCATION_ALIGNMENT, + stream.ref() + ) # See the note about `no_gc_clear` in `device_buffer.pyx`. 
@@ -285,10 +148,6 @@ cdef class UpstreamResourceAdaptor(DeviceMemoryResource): self.upstream_mr = upstream_mr - def __dealloc__(self): - # Must cleanup the base MR before any upstream MR - self.c_obj.reset() - cpdef DeviceMemoryResource get_upstream(self): return self.upstream_mr @@ -435,7 +294,7 @@ cdef class PoolMemoryResource(UpstreamResourceAdaptor): ) self.c_obj.reset( new pool_memory_resource[device_memory_resource]( - upstream_mr.get_mr(), + deref(upstream_mr.get_ref()), c_initial_pool_size, c_maximum_pool_size ) @@ -465,10 +324,7 @@ cdef class PoolMemoryResource(UpstreamResourceAdaptor): pass def pool_size(self): - cdef pool_memory_resource[device_memory_resource]* c_mr = ( - (self.get_mr()) - ) - return c_mr.pool_size() + return self.c_obj.get().pool_size() cdef class FixedSizeMemoryResource(UpstreamResourceAdaptor): def __cinit__( @@ -1169,12 +1025,7 @@ cpdef set_per_device_resource(int device, DeviceMemoryResource mr): global _per_device_mrs _per_device_mrs[device] = mr - # Since cuda_device_id does not have a default constructor, it must be heap - # allocated - cdef unique_ptr[cuda_device_id] device_id = \ - make_unique[cuda_device_id](device) - - cpp_set_per_device_resource(deref(device_id), mr.get_mr()) + cpp_set_per_device_resource(cuda_device_id(device), deref(mr.get_ref())) cpdef set_current_device_resource(DeviceMemoryResource mr): diff --git a/python/rmm/rmm/_lib/per_device_resource.pxd b/python/rmm/rmm/_lib/per_device_resource.pxd index c33217622..7b9370006 100644 --- a/python/rmm/rmm/_lib/per_device_resource.pxd +++ b/python/rmm/rmm/_lib/per_device_resource.pxd @@ -1,23 +1,31 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from rmm._lib.memory_resource cimport device_memory_resource +from rmm._lib.resource_ref cimport device_async_resource_ref -cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: +cdef extern from "rmm/cuda_device.hpp" namespace "rmm" nogil: cdef cppclass cuda_device_id: ctypedef int value_type - + cuda_device_id() cuda_device_id(value_type id) - value_type value() cdef extern from "rmm/mr/device/per_device_resource.hpp" \ namespace "rmm::mr" nogil: - cdef device_memory_resource* set_current_device_resource( - device_memory_resource* new_mr - ) - cdef device_memory_resource* get_current_device_resource() - cdef device_memory_resource* set_per_device_resource( - cuda_device_id id, device_memory_resource* new_mr - ) - cdef device_memory_resource* get_per_device_resource ( - cuda_device_id id + cdef device_async_resource_ref set_per_device_resource( + cuda_device_id id, + device_async_resource_ref new_mr ) diff --git a/python/rmm/rmm/_lib/resource_ref.pxd b/python/rmm/rmm/_lib/resource_ref.pxd new file mode 100644 index 000000000..1c2c94325 --- /dev/null +++ b/python/rmm/rmm/_lib/resource_ref.pxd @@ -0,0 +1,39 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from cuda.ccudart cimport cudaStream_t + + +cdef extern from "cuda/memory_resource" namespace "cuda" nogil: + cdef cppclass stream_ref: + stream_ref() except + + stream_ref(cudaStream_t stream) except + + +cdef extern from "rmm/aligned.hpp" namespace "rmm" nogil: + cdef size_t CUDA_ALLOCATION_ALIGNMENT + +cdef extern from "rmm/resource_ref.hpp" namespace "rmm" nogil: + cdef cppclass device_async_resource_ref: + void* allocate(size_t bytes, size_t alignment) except + + void deallocate(void* ptr, size_t bytes, size_t alignment) except + + void* allocate_async( + size_t bytes, + size_t alignment, + stream_ref stream) except + + void deallocate_async( + void* ptr, + size_t bytes, + size_t alignment, + stream_ref stream + ) except + diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 75b15a90b..b65c39060 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -41,6 +41,8 @@ function(ConfigureTestInternal TEST_NAME) target_compile_options(${TEST_NAME} PUBLIC $<$:-Wall -Werror -Wno-error=deprecated-declarations>) + target_compile_options(${TEST_NAME} PRIVATE "$<$:-O0>") + if(DISABLE_DEPRECATION_WARNING) target_compile_options( ${TEST_NAME} PUBLIC $<$:-Xcompiler=-Wno-deprecated-declarations>) diff --git a/tests/container_multidevice_tests.cu b/tests/container_multidevice_tests.cu index e58ba53a2..7413dd2dc 100644 --- a/tests/container_multidevice_tests.cu +++ b/tests/container_multidevice_tests.cu @@ -42,9 +42,9 @@ TYPED_TEST(ContainerMultiDeviceTest, CreateDestroyDifferentActiveDevice) // only run on multidevice systems if (num_devices >= 2) 
{ rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}}; - auto* orig_mr = rmm::mr::get_current_device_resource(); + auto orig_mr = rmm::mr::get_current_device_resource(); auto check_mr = device_check_resource_adaptor{orig_mr}; - rmm::mr::set_current_device_resource(&check_mr); + rmm::mr::set_current_device_resource(check_mr); { if constexpr (std::is_same_v>) { @@ -69,7 +69,7 @@ TYPED_TEST(ContainerMultiDeviceTest, CreateMoveDestroyDifferentActiveDevice) // only run on multidevice systems if (num_devices >= 2) { rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}}; - auto* orig_mr = rmm::mr::get_current_device_resource(); + auto orig_mr = rmm::mr::get_current_device_resource(); auto check_mr = device_check_resource_adaptor{orig_mr}; rmm::mr::set_current_device_resource(&check_mr); @@ -109,7 +109,7 @@ TYPED_TEST(ContainerMultiDeviceTest, ResizeDifferentActiveDevice) // only run on multidevice systems if (num_devices >= 2) { rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}}; - auto* orig_mr = rmm::mr::get_current_device_resource(); + auto orig_mr = rmm::mr::get_current_device_resource(); auto check_mr = device_check_resource_adaptor{orig_mr}; rmm::mr::set_current_device_resource(&check_mr); @@ -132,7 +132,7 @@ TYPED_TEST(ContainerMultiDeviceTest, ShrinkDifferentActiveDevice) // only run on multidevice systems if (num_devices >= 2) { rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}}; - auto* orig_mr = rmm::mr::get_current_device_resource(); + auto orig_mr = rmm::mr::get_current_device_resource(); auto check_mr = device_check_resource_adaptor{orig_mr}; rmm::mr::set_current_device_resource(&check_mr); diff --git a/tests/cuda_stream_tests.cpp b/tests/cuda_stream_tests.cpp index 1cc068434..ec7e6c3e9 100644 --- a/tests/cuda_stream_tests.cpp +++ b/tests/cuda_stream_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -96,6 +96,6 @@ TEST_F(CudaStreamDeathTest, TestSyncNoThrow) // should assert here or in `~cuda_stream()` stream_a.synchronize_no_throw(); }; - EXPECT_DEATH(test(), "Assertion"); + EXPECT_DEATH(test(), ""); } #endif diff --git a/tests/device_check_resource_adaptor.hpp b/tests/device_check_resource_adaptor.hpp index fcb578fdf..c77c078db 100644 --- a/tests/device_check_resource_adaptor.hpp +++ b/tests/device_check_resource_adaptor.hpp @@ -17,13 +17,14 @@ #include #include #include +#include #include #include class device_check_resource_adaptor final : public rmm::mr::device_memory_resource { public: - device_check_resource_adaptor(rmm::mr::device_memory_resource* upstream) + device_check_resource_adaptor(rmm::device_async_resource_ref upstream) : device_id{rmm::get_current_cuda_device()}, upstream_(upstream) { } @@ -36,11 +37,6 @@ class device_check_resource_adaptor final : public rmm::mr::device_memory_resour return upstream_; } - /** - * @briefreturn{device_memory_resource* to the upstream memory resource} - */ - [[nodiscard]] device_memory_resource* get_upstream() const noexcept { return upstream_; } - private: [[nodiscard]] bool check_device_id() const { return device_id == rmm::get_current_cuda_device(); } @@ -48,7 +44,7 @@ class device_check_resource_adaptor final : public rmm::mr::device_memory_resour { bool const is_correct_device = check_device_id(); EXPECT_TRUE(is_correct_device); - if (is_correct_device) { return upstream_->allocate(bytes, stream); } + if (is_correct_device) { return get_upstream_resource().allocate_async(bytes, stream); } return nullptr; } @@ -56,7 +52,7 @@ class device_check_resource_adaptor final : public rmm::mr::device_memory_resour { bool const is_correct_device = check_device_id(); EXPECT_TRUE(is_correct_device); - if (is_correct_device) { upstream_->deallocate(ptr, bytes, stream); } + if 
(is_correct_device) { get_upstream_resource().deallocate_async(ptr, bytes, stream); } } [[nodiscard]] bool do_is_equal( @@ -64,10 +60,10 @@ class device_check_resource_adaptor final : public rmm::mr::device_memory_resour { if (this == &other) { return true; } auto const* cast = dynamic_cast(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } rmm::cuda_device_id device_id; - rmm::mr::device_memory_resource* upstream_{}; + rmm::device_async_resource_ref upstream_{rmm::mr::get_current_device_resource()}; }; diff --git a/tests/mock_resource.hpp b/tests/mock_resource.hpp index e06148d3a..555cf0d74 100644 --- a/tests/mock_resource.hpp +++ b/tests/mock_resource.hpp @@ -25,7 +25,12 @@ class mock_resource : public rmm::mr::device_memory_resource { public: MOCK_METHOD(void*, do_allocate, (std::size_t, cuda_stream_view), (override)); MOCK_METHOD(void, do_deallocate, (void*, std::size_t, cuda_stream_view), (override)); + bool operator==(mock_resource const&) const noexcept { return true; } + bool operator!=(mock_resource const&) const { return false; } + friend void get_property(mock_resource const&, cuda::mr::device_accessible) noexcept {} using size_pair = std::pair; }; +static_assert(cuda::mr::async_resource_with); + } // namespace rmm::test diff --git a/tests/mr/device/aligned_mr_tests.cpp b/tests/mr/device/aligned_mr_tests.cpp index b9ecbc8ca..85262c29d 100644 --- a/tests/mr/device/aligned_mr_tests.cpp +++ b/tests/mr/device/aligned_mr_tests.cpp @@ -59,13 +59,13 @@ TEST(AlignedTest, ThrowOnInvalidAllocationAlignment) TEST(AlignedTest, SupportsGetMemInfo) { mock_resource mock; - aligned_mock mr{&mock}; + aligned_mock mr{mock}; } TEST(AlignedTest, DefaultAllocationAlignmentPassthrough) { mock_resource mock; - aligned_mock mr{&mock}; + aligned_mock mr{mock}; cuda_stream_view stream; void* const pointer = int_to_address(123); diff --git 
a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index 1eb38888e..6b7468d74 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -15,6 +15,7 @@ */ #include "../../byte_literals.hpp" +#include "cuda/stream_ref" #include #include @@ -23,6 +24,7 @@ #include #include #include +#include #include #include @@ -37,15 +39,22 @@ namespace { class mock_memory_resource { public: - MOCK_METHOD(void*, allocate, (std::size_t)); - MOCK_METHOD(void, deallocate, (void*, std::size_t)); + MOCK_METHOD(void*, allocate, (std::size_t, std::size_t)); + MOCK_METHOD(void, deallocate, (void*, std::size_t, std::size_t)); + MOCK_METHOD(void*, allocate_async, (std::size_t, std::size_t, cuda::stream_ref)); + MOCK_METHOD(void, deallocate_async, (void*, std::size_t, std::size_t, cuda::stream_ref)); + bool operator==(mock_memory_resource const&) const noexcept { return true; } + bool operator!=(mock_memory_resource const&) const { return false; } + friend void get_property(mock_memory_resource const&, cuda::mr::device_accessible) noexcept {} }; +static_assert(cuda::mr::async_resource_with); + using rmm::mr::detail::arena::block; using rmm::mr::detail::arena::byte_span; using rmm::mr::detail::arena::superblock; -using global_arena = rmm::mr::detail::arena::global_arena; -using arena = rmm::mr::detail::arena::arena; +using global_arena = rmm::mr::detail::arena::global_arena; +using arena = rmm::mr::detail::arena::arena; using arena_mr = rmm::mr::arena_memory_resource; using ::testing::Return; @@ -59,9 +68,10 @@ auto const fake_address4 = reinterpret_cast(superblock::minimum_size * 2) struct ArenaTest : public ::testing::Test { void SetUp() override { - EXPECT_CALL(mock_mr, allocate(arena_size)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock_mr, deallocate(fake_address3, arena_size)); - global = std::make_unique(&mock_mr, arena_size); + EXPECT_CALL(mock_mr, allocate(arena_size, ::testing::_)).WillOnce(Return(fake_address3)); + 
EXPECT_CALL(mock_mr, deallocate(fake_address3, arena_size, ::testing::_)); + + global = std::make_unique(mock_mr, arena_size); per_thread = std::make_unique(*global); } @@ -293,13 +303,6 @@ TEST_F(ArenaTest, SuperblockMaxFreeSizeWhenFull) // NOLINT /** * Test global_arena. */ - -TEST_F(ArenaTest, GlobalArenaNullUpstream) // NOLINT -{ - auto construct_nullptr = []() { global_arena global{nullptr, std::nullopt}; }; - EXPECT_THROW(construct_nullptr(), rmm::logic_error); // NOLINT(cppcoreguidelines-avoid-goto) -} - TEST_F(ArenaTest, GlobalArenaAcquire) // NOLINT { auto const sblk = global->acquire(256); @@ -378,7 +381,7 @@ TEST_F(ArenaTest, GlobalArenaDeallocate) // NOLINT { auto* ptr = global->allocate(superblock::minimum_size * 2); EXPECT_EQ(ptr, fake_address3); - global->deallocate(ptr, superblock::minimum_size * 2, {}); + global->deallocate_async(ptr, superblock::minimum_size * 2, {}); ptr = global->allocate(superblock::minimum_size * 2); EXPECT_EQ(ptr, fake_address3); } @@ -387,8 +390,8 @@ TEST_F(ArenaTest, GlobalArenaDeallocateAlignUp) // NOLINT { auto* ptr = global->allocate(superblock::minimum_size + 256); auto* ptr2 = global->allocate(superblock::minimum_size + 512); - global->deallocate(ptr, superblock::minimum_size + 256, {}); - global->deallocate(ptr2, superblock::minimum_size + 512, {}); + global->deallocate_async(ptr, superblock::minimum_size + 256, {}); + global->deallocate_async(ptr2, superblock::minimum_size + 512, {}); EXPECT_EQ(global->allocate(arena_size), fake_address3); } diff --git a/tests/mr/device/callback_mr_tests.cpp b/tests/mr/device/callback_mr_tests.cpp index 34a2cc8cc..0b66e925b 100644 --- a/tests/mr/device/callback_mr_tests.cpp +++ b/tests/mr/device/callback_mr_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -36,22 +36,24 @@ using ::testing::_; TEST(CallbackTest, TestCallbacksAreInvoked) { - auto base_mr = mock_resource(); + auto base_mr = mock_resource(); + auto base_ref = device_async_resource_ref{base_mr}; EXPECT_CALL(base_mr, do_allocate(10_MiB, cuda_stream_view{})).Times(1); EXPECT_CALL(base_mr, do_deallocate(_, 10_MiB, cuda_stream_view{})).Times(1); auto allocate_callback = [](std::size_t size, cuda_stream_view stream, void* arg) { - auto base_mr = static_cast(arg); - return base_mr->allocate(size, stream); + auto base_mr = *static_cast(arg); + return base_mr.allocate_async(size, stream); }; auto deallocate_callback = [](void* ptr, std::size_t size, cuda_stream_view stream, void* arg) { - auto base_mr = static_cast(arg); - base_mr->deallocate(ptr, size, stream); + auto base_mr = *static_cast(arg); + base_mr.deallocate_async(ptr, size, stream); }; auto mr = - rmm::mr::callback_memory_resource(allocate_callback, deallocate_callback, &base_mr, &base_mr); - auto ptr = mr.allocate(10_MiB); - mr.deallocate(ptr, 10_MiB); + rmm::mr::callback_memory_resource(allocate_callback, deallocate_callback, &base_ref, &base_ref); + auto const size = std::size_t{10_MiB}; + auto* ptr = mr.allocate(size); + mr.deallocate(ptr, size); } TEST(CallbackTest, LoggingTest) @@ -61,22 +63,23 @@ TEST(CallbackTest, LoggingTest) auto base_mr = rmm::mr::get_current_device_resource(); auto allocate_callback = [](std::size_t size, cuda_stream_view stream, void* arg) { std::cout << "Allocating " << size << " bytes" << std::endl; - auto base_mr = static_cast(arg); - return base_mr->allocate(size, stream); + auto base_mr = *static_cast(arg); + return base_mr.allocate_async(size, stream); }; auto deallocate_callback = [](void* ptr, std::size_t size, cuda_stream_view stream, void* arg) { std::cout << "Deallocating " << size << " bytes" << std::endl; - auto base_mr = static_cast(arg); - base_mr->deallocate(ptr, size, stream); + auto base_mr = *static_cast(arg); + base_mr.deallocate_async(ptr, size, 
stream); }; auto mr = - rmm::mr::callback_memory_resource(allocate_callback, deallocate_callback, base_mr, base_mr); - auto ptr = mr.allocate(10_MiB); - mr.deallocate(ptr, 10_MiB); + rmm::mr::callback_memory_resource(allocate_callback, deallocate_callback, &base_mr, &base_mr); + auto const size = std::size_t{10_MiB}; + auto* ptr = mr.allocate(size); + mr.deallocate(ptr, size); std::string output = testing::internal::GetCapturedStdout(); - std::string expect = fmt::format("Allocating {} bytes\nDeallocating {} bytes\n", 10_MiB, 10_MiB); + std::string expect = fmt::format("Allocating {} bytes\nDeallocating {} bytes\n", size, size); ASSERT_EQ(expect, output); } diff --git a/tests/mr/device/limiting_mr_tests.cpp b/tests/mr/device/limiting_mr_tests.cpp index 777ce9428..5ef914dd4 100644 --- a/tests/mr/device/limiting_mr_tests.cpp +++ b/tests/mr/device/limiting_mr_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,19 +25,19 @@
 namespace rmm::test {
 namespace {
 
-using Limiting_adaptor = rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource>;
+using limiting_adaptor = rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource>;
 
 TEST(LimitingTest, ThrowOnNullUpstream)
 {
   auto const max_size{5_MiB};
-  auto construct_nullptr = []() { Limiting_adaptor mr{nullptr, max_size}; };
+  auto construct_nullptr = []() { limiting_adaptor mr{nullptr, max_size}; };
   EXPECT_THROW(construct_nullptr(), rmm::logic_error);
 }
 
 TEST(LimitingTest, TooBig)
 {
   auto const max_size{5_MiB};
-  Limiting_adaptor mr{rmm::mr::get_current_device_resource(), max_size};
+  limiting_adaptor mr{rmm::mr::get_current_device_resource(), max_size};
   EXPECT_THROW(mr.allocate(max_size + 1), rmm::out_of_memory);
 }
 
@@ -45,15 +45,15 @@ TEST(LimitingTest, UpstreamFailure)
 {
   auto const max_size_1{2_MiB};
   auto const max_size_2{5_MiB};
-  Limiting_adaptor mr1{rmm::mr::get_current_device_resource(), max_size_1};
-  Limiting_adaptor mr2{&mr1, max_size_2};
+  limiting_adaptor mr1{rmm::mr::get_current_device_resource(), max_size_1};
+  limiting_adaptor mr2{&mr1, max_size_2};
   EXPECT_THROW(mr2.allocate(4_MiB), rmm::out_of_memory);
 }
 
 TEST(LimitingTest, UnderLimitDueToFrees)
 {
   auto const max_size{10_MiB};
-  Limiting_adaptor mr{rmm::mr::get_current_device_resource(), max_size};
+  limiting_adaptor mr{rmm::mr::get_current_device_resource(), max_size};
   auto const size1{4_MiB};
   auto* ptr1           = mr.allocate(size1);
   auto allocated_bytes = size1;
@@ -81,7 +81,7 @@ TEST(LimitingTest, UnderLimitDueToFrees)
 TEST(LimitingTest, OverLimit)
 {
   auto const max_size{10_MiB};
-  Limiting_adaptor mr{rmm::mr::get_current_device_resource(), max_size};
+  limiting_adaptor mr{rmm::mr::get_current_device_resource(), max_size};
   auto const size1{4_MiB};
   auto* ptr1           = mr.allocate(size1);
   auto allocated_bytes = size1;
diff --git a/tests/mr/device/mr_ref_multithreaded_tests.cpp b/tests/mr/device/mr_ref_multithreaded_tests.cpp
index 3ba32445f..2a7c2904c 100644
--- a/tests/mr/device/mr_ref_multithreaded_tests.cpp
+++ b/tests/mr/device/mr_ref_multithreaded_tests.cpp
@@ -69,42 +69,40 @@ void spawn(Task task, Arguments&&... args)
 
 TEST(DefaultTest, UseCurrentDeviceResource_mt) { spawn(test_get_current_device_resource); }
 
-TEST(DefaultTest, CurrentDeviceResourceIsCUDA_mt)
-{
-  spawn([]() {
-    EXPECT_NE(nullptr, rmm::mr::get_current_device_resource());
-    EXPECT_TRUE(rmm::mr::get_current_device_resource()->is_equal(rmm::mr::cuda_memory_resource{}));
-  });
-}
-
 TEST(DefaultTest, GetCurrentDeviceResource_mt)
 {
   spawn([]() {
-    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-    EXPECT_NE(nullptr, mr);
-    EXPECT_TRUE(mr->is_equal(rmm::mr::cuda_memory_resource{}));
+    EXPECT_EQ(rmm::mr::get_current_device_resource(),
+              rmm::device_async_resource_ref{rmm::mr::detail::initial_resource()});
   });
 }
 
-// Disable until we support resource_ref with set_current_device_resource
-/*TEST_P(mr_ref_test_mt, SetCurrentDeviceResource_mt)
+TEST_P(mr_ref_test_mt, SetCurrentDeviceResource_mt)
 {
   // single thread changes default resource, then multiple threads use it
+  rmm::mr::cuda_memory_resource cuda_mr{};
+  auto cuda_ref = rmm::device_async_resource_ref{cuda_mr};
 
-  rmm::mr::device_memory_resource* old = rmm::mr::set_current_device_resource(this->mr.get());
-  EXPECT_NE(nullptr, old);
+  rmm::mr::set_current_device_resource(cuda_ref);
+  auto old = rmm::mr::set_current_device_resource(this->ref);
 
-  spawn([mr = this->mr.get()]() {
+  spawn([mr = this->ref]() {
     EXPECT_EQ(mr, rmm::mr::get_current_device_resource());
     test_get_current_device_resource();  // test allocating with the new default resource
   });
 
-  // setting default resource w/ nullptr should reset to initial
-  rmm::mr::set_current_device_resource(nullptr);
-  EXPECT_TRUE(old->is_equal(*rmm::mr::get_current_device_resource()));
-}*/
+  // old mr should equal a cuda mr
+  EXPECT_EQ(old, cuda_ref);
+
+  // Resetting should reset to initial cuda resource
+  rmm::mr::reset_current_device_resource();
+  EXPECT_EQ(rmm::mr::get_current_device_resource(),
+            rmm::device_async_resource_ref{rmm::mr::detail::initial_resource()});
+}
 
-/*TEST_P(mr_ref_test_mt, SetCurrentDeviceResourcePerThread_mt)
+TEST_P(mr_ref_test_mt, SetCurrentDeviceResourcePerThread_mt)
 {
   int num_devices{};
   RMM_CUDA_TRY(cudaGetDeviceCount(&num_devices));
@@ -112,31 +110,29 @@ TEST(DefaultTest, GetCurrentDeviceResource_mt)
   std::vector<std::thread> threads;
   threads.reserve(num_devices);
   for (int i = 0; i < num_devices; ++i) {
-    threads.emplace_back(std::thread{[mr = this->mr.get()](auto dev_id) {
-                                       RMM_CUDA_TRY(cudaSetDevice(dev_id));
-                                       rmm::mr::device_memory_resource* old =
-                                         rmm::mr::set_current_device_resource(mr);
-                                       EXPECT_NE(nullptr, old);
-                                       // initial resource for this device should be CUDA mr
-                                       EXPECT_TRUE(old->is_equal(rmm::mr::cuda_memory_resource{}));
-                                       // get_current_device_resource should equal the resource we
-                                       // just set
-                                       EXPECT_EQ(mr, rmm::mr::get_current_device_resource());
-                                       // Setting current dev resource to nullptr should reset to
-                                       // cuda MR and return the MR we previously set
-                                       old = rmm::mr::set_current_device_resource(nullptr);
-                                       EXPECT_NE(nullptr, old);
-                                       EXPECT_EQ(old, mr);
-                                       EXPECT_TRUE(rmm::mr::get_current_device_resource()->is_equal(
-                                         rmm::mr::cuda_memory_resource{}));
-                                     },
-                                     i});
+    threads.emplace_back(
+      [mr = this->ref](auto dev_id) {
+        rmm::mr::cuda_memory_resource cuda_mr{};
+        auto cuda_ref = rmm::device_async_resource_ref{cuda_mr};
+
+        rmm::mr::set_current_device_resource(cuda_ref);
+        auto old = rmm::mr::set_current_device_resource(mr);
+        // old mr should equal a cuda mr
+        EXPECT_EQ(old, cuda_ref);
+        // current dev resource should equal this resource
+        EXPECT_EQ(mr, rmm::mr::get_current_device_resource());
+        // Resetting should reset to initial cuda resource
+        rmm::mr::reset_current_device_resource();
+        EXPECT_EQ(rmm::mr::get_current_device_resource(),
+                  rmm::device_async_resource_ref{rmm::mr::detail::initial_resource()});
+      },
+      i);
   }
   for (auto& thread : threads) { thread.join(); }
-}*/
+}
 
 TEST_P(mr_ref_test_mt, Allocate) { spawn(test_various_allocations, this->ref); }
diff --git a/tests/mr/device/mr_ref_test.hpp b/tests/mr/device/mr_ref_test.hpp
index df0045d2b..8398da4c4 100644
--- a/tests/mr/device/mr_ref_test.hpp
+++ b/tests/mr/device/mr_ref_test.hpp
@@ -65,12 +65,11 @@ struct allocation {
 
 inline void test_get_current_device_resource()
 {
-  EXPECT_NE(nullptr, rmm::mr::get_current_device_resource());
-  void* ptr = rmm::mr::get_current_device_resource()->allocate(1_MiB);
+  void* ptr = rmm::mr::get_current_device_resource().allocate(1_MiB);
   EXPECT_NE(nullptr, ptr);
   EXPECT_TRUE(is_properly_aligned(ptr));
   EXPECT_TRUE(is_device_accessible_memory(ptr));
-  rmm::mr::get_current_device_resource()->deallocate(ptr, 1_MiB);
+  rmm::mr::get_current_device_resource().deallocate(ptr, 1_MiB);
 }
 
 inline void test_allocate(resource_ref ref, std::size_t bytes)
diff --git a/tests/mr/device/mr_ref_tests.cpp b/tests/mr/device/mr_ref_tests.cpp
index d94817bef..c7e49d84b 100644
--- a/tests/mr/device/mr_ref_tests.cpp
+++ b/tests/mr/device/mr_ref_tests.cpp
@@ -55,40 +55,36 @@ INSTANTIATE_TEST_SUITE_P(ResourceAllocationTests,
                          "Binning"),
                          [](auto const& info) { return info.param; });
 
-TEST(DefaultTest, CurrentDeviceResourceIsCUDA)
-{
-  EXPECT_NE(nullptr, rmm::mr::get_current_device_resource());
-  EXPECT_TRUE(rmm::mr::get_current_device_resource()->is_equal(rmm::mr::cuda_memory_resource{}));
-}
-
 TEST(DefaultTest, UseCurrentDeviceResource) { test_get_current_device_resource(); }
 
 TEST(DefaultTest, GetCurrentDeviceResource)
 {
-  auto* mr = rmm::mr::get_current_device_resource();
-  EXPECT_NE(nullptr, mr);
-  EXPECT_TRUE(mr->is_equal(rmm::mr::cuda_memory_resource{}));
+  EXPECT_EQ(rmm::mr::get_current_device_resource(),
+            rmm::device_async_resource_ref{rmm::mr::detail::initial_resource()});
 }
 
-// Disable until we support resource_ref with set_current_device_resource
-/*TEST_P(mr_ref_test, SetCurrentDeviceResource)
+TEST_P(mr_ref_test, SetCurrentDeviceResource)
 {
-  rmm::mr::device_memory_resource* old{};
-  old = rmm::mr::set_current_device_resource(this->mr.get());
-  EXPECT_NE(nullptr, old);
+  rmm::mr::cuda_memory_resource cuda_mr{};
+  auto cuda_ref = rmm::device_async_resource_ref{cuda_mr};
+
+  rmm::mr::set_current_device_resource(cuda_ref);
+  auto old = rmm::mr::set_current_device_resource(this->ref);
   // old mr should equal a cuda mr
-  EXPECT_TRUE(old->is_equal(rmm::mr::cuda_memory_resource{}));
+  EXPECT_EQ(old, cuda_ref);
   // current dev resource should equal this resource
-  EXPECT_TRUE(this->mr->is_equal(*rmm::mr::get_current_device_resource()));
+  EXPECT_EQ(this->ref, rmm::mr::get_current_device_resource());
   test_get_current_device_resource();
-  // setting to `nullptr` should reset to initial cuda resource
-  rmm::mr::set_current_device_resource(nullptr);
-  EXPECT_TRUE(rmm::mr::get_current_device_resource()->is_equal(rmm::mr::cuda_memory_resource{}));
-}*/
+  // Resetting should reset to initial cuda resource
+  rmm::mr::reset_current_device_resource();
+  EXPECT_EQ(rmm::mr::get_current_device_resource(),
+            rmm::device_async_resource_ref{rmm::mr::detail::initial_resource()});
+}
 
 TEST_P(mr_ref_test, SelfEquality) { EXPECT_TRUE(this->ref == this->ref); }
diff --git a/tests/mr/device/pool_mr_tests.cpp b/tests/mr/device/pool_mr_tests.cpp
index c63a61844..d82464735 100644
--- a/tests/mr/device/pool_mr_tests.cpp
+++ b/tests/mr/device/pool_mr_tests.cpp
@@ -61,7 +61,7 @@ TEST(PoolTest, ReferenceThrowMaxLessThanInitial)
   auto max_less_than_initial = []() {
     const auto initial{1024};
     const auto maximum{256};
-    pool_mr mr{*rmm::mr::get_current_device_resource(), initial, maximum};
+    pool_mr mr{rmm::mr::get_current_device_resource(), initial, maximum};
   };
   EXPECT_THROW(max_less_than_initial(), rmm::logic_error);
 }
diff --git a/tests/mr/device/statistics_mr_tests.cpp b/tests/mr/device/statistics_mr_tests.cpp
index 6c5700f0b..c88a2ddca 100644
--- a/tests/mr/device/statistics_mr_tests.cpp
+++ b/tests/mr/device/statistics_mr_tests.cpp
@@ -127,7 +127,7 @@ TEST(StatisticsTest, PeakAllocations)
 
 TEST(StatisticsTest, MultiTracking)
 {
-  auto* orig_device_resource = rmm::mr::get_current_device_resource();
+  auto orig_device_resource = rmm::mr::get_current_device_resource();
   statistics_adaptor mr{orig_device_resource};
   rmm::mr::set_current_device_resource(&mr);
 
diff --git a/tests/mr/device/tracking_mr_tests.cpp b/tests/mr/device/tracking_mr_tests.cpp
index 7c2532c60..a2d0e41e7 100644
--- a/tests/mr/device/tracking_mr_tests.cpp
+++ b/tests/mr/device/tracking_mr_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -101,7 +101,7 @@ TEST(TrackingTest, AllocationsLeftWithoutStacks)
 
 TEST(TrackingTest, MultiTracking)
 {
-  auto* orig_device_resource = rmm::mr::get_current_device_resource();
+  auto orig_device_resource = rmm::mr::get_current_device_resource();
   tracking_adaptor mr{orig_device_resource, true};
   rmm::mr::set_current_device_resource(&mr);
 
diff --git a/tests/mr/host/mr_ref_tests.cpp b/tests/mr/host/mr_ref_tests.cpp
index 8445ab1f5..071739575 100644
--- a/tests/mr/host/mr_ref_tests.cpp
+++ b/tests/mr/host/mr_ref_tests.cpp
@@ -233,14 +233,17 @@ TYPED_TEST(MRRefTest, UnsupportedAlignmentTest)
   for (std::size_t num_trials = 0; num_trials < NUM_TRIALS; ++num_trials) {
     for (std::size_t alignment = MinTestedAlignment; alignment <= MaxTestedAlignment;
          alignment *= TestedAlignmentMultiplier) {
+#ifdef NDEBUG
       auto allocation_size = size_distribution(generator);
       void* ptr{nullptr};
       // An unsupported alignment (like an odd number) should result in an
       // alignment of `alignof(std::max_align_t)`
       auto const bad_alignment = alignment + 1;
+
       EXPECT_NO_THROW(ptr = this->ref.allocate(allocation_size, bad_alignment));
       EXPECT_TRUE(is_aligned(ptr, alignof(std::max_align_t)));
       EXPECT_NO_THROW(this->ref.deallocate(ptr, allocation_size, bad_alignment));
+#endif
     }
   }
 }