Skip to content

Commit

Permalink
[EM] Compress dense ellpack.
Browse files Browse the repository at this point in the history
This helps reduce the memory copying needed for dense data.

- Compress the dense ellpack pages by making the number of symbols equal to the number
of features.
- Avoid fetching data in the data source during `at_end_`.
- Clean up and optimize memory operations.
  • Loading branch information
trivialfis committed Sep 18, 2024
1 parent 96bbf80 commit 0450a27
Show file tree
Hide file tree
Showing 22 changed files with 359 additions and 208 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ ipch
*.filters
*.user
*log
rmm_log.txt
Debug
*suo
.Rhistory
Expand Down
23 changes: 12 additions & 11 deletions src/common/cuda_pinned_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include <cstddef> // for size_t
#include <limits> // for numeric_limits
#include <new> // for bad_array_new_length

#include "common.h"

Expand All @@ -28,14 +29,14 @@ struct PinnedAllocPolicy {
using size_type = std::size_t; // NOLINT: The type used for the size of the allocation
using value_type = T; // NOLINT: The type of the elements in the allocator

size_type max_size() const { // NOLINT
[[nodiscard]] constexpr size_type max_size() const { // NOLINT
return std::numeric_limits<size_type>::max() / sizeof(value_type);
}

[[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT
if (cnt > this->max_size()) {
throw std::bad_alloc{};
} // end if
throw std::bad_array_new_length{};
}

pointer result(nullptr);
dh::safe_cuda(cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
Expand All @@ -52,14 +53,14 @@ struct ManagedAllocPolicy {
using size_type = std::size_t; // NOLINT: The type used for the size of the allocation
using value_type = T; // NOLINT: The type of the elements in the allocator

size_type max_size() const { // NOLINT
[[nodiscard]] constexpr size_type max_size() const { // NOLINT
return std::numeric_limits<size_type>::max() / sizeof(value_type);
}

[[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT
if (cnt > this->max_size()) {
throw std::bad_alloc{};
} // end if
throw std::bad_array_new_length{};
}

pointer result(nullptr);
dh::safe_cuda(cudaMallocManaged(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
Expand All @@ -78,14 +79,14 @@ struct SamAllocPolicy {
using size_type = std::size_t; // NOLINT: The type used for the size of the allocation
using value_type = T; // NOLINT: The type of the elements in the allocator

size_type max_size() const { // NOLINT
[[nodiscard]] constexpr size_type max_size() const { // NOLINT
return std::numeric_limits<size_type>::max() / sizeof(value_type);
}

[[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT
if (cnt > this->max_size()) {
throw std::bad_alloc{};
} // end if
throw std::bad_array_new_length{};
}

size_type n_bytes = cnt * sizeof(value_type);
pointer result = reinterpret_cast<pointer>(std::malloc(n_bytes));
Expand Down Expand Up @@ -139,10 +140,10 @@ class CudaHostAllocatorImpl : public Policy<T> {
};

template <typename T>
using PinnedAllocator = CudaHostAllocatorImpl<T, PinnedAllocPolicy>; // NOLINT
using PinnedAllocator = CudaHostAllocatorImpl<T, PinnedAllocPolicy>;

template <typename T>
using ManagedAllocator = CudaHostAllocatorImpl<T, ManagedAllocPolicy>; // NOLINT
using ManagedAllocator = CudaHostAllocatorImpl<T, ManagedAllocPolicy>;

template <typename T>
using SamAllocator = CudaHostAllocatorImpl<T, SamAllocPolicy>;
Expand Down
19 changes: 9 additions & 10 deletions src/data/device_adapter.cuh
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
/**
* Copyright 2019-2023 by XGBoost Contributors
* Copyright 2019-2024, XGBoost Contributors
* \file device_adapter.cuh
*/
#ifndef XGBOOST_DATA_DEVICE_ADAPTER_H_
#define XGBOOST_DATA_DEVICE_ADAPTER_H_
#include <thrust/iterator/counting_iterator.h> // for make_counting_iterator
#include <thrust/logical.h> // for none_of

#include <cstddef> // for size_t
#include <cstddef> // for size_t
#include <limits>
#include <memory>
#include <string>

#include "../common/cuda_context.cuh"
#include "../common/device_helpers.cuh"
#include "../common/math.h"
#include "adapter.h"
#include "array_interface.h"

Expand Down Expand Up @@ -208,11 +207,12 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {

// Returns maximum row length
template <typename AdapterBatchT>
bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_idx_t> offset, DeviceOrd device,
float missing) {
bst_idx_t GetRowCounts(Context const* ctx, const AdapterBatchT batch,
common::Span<bst_idx_t> offset, DeviceOrd device, float missing) {
dh::safe_cuda(cudaSetDevice(device.ordinal));
IsValidFunctor is_valid(missing);
dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
dh::safe_cuda(
cudaMemsetAsync(offset.data(), '\0', offset.size_bytes(), ctx->CUDACtx()->Stream()));

auto n_samples = batch.NumRows();
bst_feature_t n_features = batch.NumCols();
Expand All @@ -230,7 +230,7 @@ bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_idx_t> offset
}

// Count elements per row
dh::LaunchN(n_samples * stride, [=] __device__(std::size_t idx) {
dh::LaunchN(n_samples * stride, ctx->CUDACtx()->Stream(), [=] __device__(std::size_t idx) {
bst_idx_t cnt{0};
auto [ridx, fbeg] = linalg::UnravelIndex(idx, n_samples, stride);
SPAN_CHECK(ridx < n_samples);
Expand All @@ -244,9 +244,8 @@ bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_idx_t> offset
&offset[ridx]),
static_cast<unsigned long long>(cnt)); // NOLINT
});
dh::XGBCachingDeviceAllocator<char> alloc;
bst_idx_t row_stride =
dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
dh::Reduce(ctx->CUDACtx()->CTP(), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()) + offset.size(),
static_cast<bst_idx_t>(0), thrust::maximum<bst_idx_t>());
return row_stride;
Expand Down
Loading

0 comments on commit 0450a27

Please sign in to comment.