Skip to content

Commit

Permalink
[EM] Compress dense ellpack.
Browse files Browse the repository at this point in the history
This helps reduce the memory copying needed for dense data.

- Compress the dense ellpack pages by making the number of symbols equal to the number
of features.
- Avoid fetching data in the data source during `at_end_`.
- Clean up and optimize memory operations.
  • Loading branch information
trivialfis committed Sep 18, 2024
1 parent 96bbf80 commit 0450a27
Show file tree
Hide file tree
Showing 22 changed files with 359 additions and 208 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ ipch
*.filters
*.user
*log
rmm_log.txt
Debug
*suo
.Rhistory
Expand Down
23 changes: 12 additions & 11 deletions src/common/cuda_pinned_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include <cstddef> // for size_t
#include <limits> // for numeric_limits
#include <new> // for bad_array_new_length

#include "common.h"

Expand All @@ -28,14 +29,14 @@ struct PinnedAllocPolicy {
using size_type = std::size_t; // NOLINT: The type used for the size of the allocation
using value_type = T; // NOLINT: The type of the elements in the allocator

size_type max_size() const { // NOLINT
[[nodiscard]] constexpr size_type max_size() const { // NOLINT
return std::numeric_limits<size_type>::max() / sizeof(value_type);
}

[[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT
if (cnt > this->max_size()) {
throw std::bad_alloc{};
} // end if
throw std::bad_array_new_length{};
}

pointer result(nullptr);
dh::safe_cuda(cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
Expand All @@ -52,14 +53,14 @@ struct ManagedAllocPolicy {
using size_type = std::size_t; // NOLINT: The type used for the size of the allocation
using value_type = T; // NOLINT: The type of the elements in the allocator

size_type max_size() const { // NOLINT
[[nodiscard]] constexpr size_type max_size() const { // NOLINT
return std::numeric_limits<size_type>::max() / sizeof(value_type);
}

[[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT
if (cnt > this->max_size()) {
throw std::bad_alloc{};
} // end if
throw std::bad_array_new_length{};
}

pointer result(nullptr);
dh::safe_cuda(cudaMallocManaged(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
Expand All @@ -78,14 +79,14 @@ struct SamAllocPolicy {
using size_type = std::size_t; // NOLINT: The type used for the size of the allocation
using value_type = T; // NOLINT: The type of the elements in the allocator

size_type max_size() const { // NOLINT
[[nodiscard]] constexpr size_type max_size() const { // NOLINT
return std::numeric_limits<size_type>::max() / sizeof(value_type);
}

[[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT
if (cnt > this->max_size()) {
throw std::bad_alloc{};
} // end if
throw std::bad_array_new_length{};
}

size_type n_bytes = cnt * sizeof(value_type);
pointer result = reinterpret_cast<pointer>(std::malloc(n_bytes));
Expand Down Expand Up @@ -139,10 +140,10 @@ class CudaHostAllocatorImpl : public Policy<T> {
};

template <typename T>
using PinnedAllocator = CudaHostAllocatorImpl<T, PinnedAllocPolicy>; // NOLINT
using PinnedAllocator = CudaHostAllocatorImpl<T, PinnedAllocPolicy>;

template <typename T>
using ManagedAllocator = CudaHostAllocatorImpl<T, ManagedAllocPolicy>; // NOLINT
using ManagedAllocator = CudaHostAllocatorImpl<T, ManagedAllocPolicy>;

template <typename T>
using SamAllocator = CudaHostAllocatorImpl<T, SamAllocPolicy>;
Expand Down
19 changes: 9 additions & 10 deletions src/data/device_adapter.cuh
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
/**
* Copyright 2019-2023 by XGBoost Contributors
* Copyright 2019-2024, XGBoost Contributors
* \file device_adapter.cuh
*/
#ifndef XGBOOST_DATA_DEVICE_ADAPTER_H_
#define XGBOOST_DATA_DEVICE_ADAPTER_H_
#include <thrust/iterator/counting_iterator.h> // for make_counting_iterator
#include <thrust/logical.h> // for none_of

#include <cstddef> // for size_t
#include <cstddef> // for size_t
#include <limits>
#include <memory>
#include <string>

#include "../common/cuda_context.cuh"
#include "../common/device_helpers.cuh"
#include "../common/math.h"
#include "adapter.h"
#include "array_interface.h"

Expand Down Expand Up @@ -208,11 +207,12 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {

// Returns maximum row length
template <typename AdapterBatchT>
bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_idx_t> offset, DeviceOrd device,
float missing) {
bst_idx_t GetRowCounts(Context const* ctx, const AdapterBatchT batch,
common::Span<bst_idx_t> offset, DeviceOrd device, float missing) {
dh::safe_cuda(cudaSetDevice(device.ordinal));
IsValidFunctor is_valid(missing);
dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
dh::safe_cuda(
cudaMemsetAsync(offset.data(), '\0', offset.size_bytes(), ctx->CUDACtx()->Stream()));

auto n_samples = batch.NumRows();
bst_feature_t n_features = batch.NumCols();
Expand All @@ -230,7 +230,7 @@ bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_idx_t> offset
}

// Count elements per row
dh::LaunchN(n_samples * stride, [=] __device__(std::size_t idx) {
dh::LaunchN(n_samples * stride, ctx->CUDACtx()->Stream(), [=] __device__(std::size_t idx) {
bst_idx_t cnt{0};
auto [ridx, fbeg] = linalg::UnravelIndex(idx, n_samples, stride);
SPAN_CHECK(ridx < n_samples);
Expand All @@ -244,9 +244,8 @@ bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_idx_t> offset
&offset[ridx]),
static_cast<unsigned long long>(cnt)); // NOLINT
});
dh::XGBCachingDeviceAllocator<char> alloc;
bst_idx_t row_stride =
dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
dh::Reduce(ctx->CUDACtx()->CTP(), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()) + offset.size(),
static_cast<bst_idx_t>(0), thrust::maximum<bst_idx_t>());
return row_stride;
Expand Down
Loading

0 comments on commit 0450a27

Please sign in to comment.