Implement fit stump. #8607

Merged: 7 commits, merged Jan 3, 2023
2 changes: 2 additions & 0 deletions R-package/src/Makevars.in
@@ -55,6 +55,7 @@ OBJECTS= \
$(PKGROOT)/src/predictor/cpu_predictor.o \
$(PKGROOT)/src/tree/constraints.o \
$(PKGROOT)/src/tree/param.o \
$(PKGROOT)/src/tree/fit_stump.o \
$(PKGROOT)/src/tree/tree_model.o \
$(PKGROOT)/src/tree/tree_updater.o \
$(PKGROOT)/src/tree/updater_approx.o \
@@ -85,6 +86,7 @@ OBJECTS= \
$(PKGROOT)/src/common/pseudo_huber.o \
$(PKGROOT)/src/common/quantile.o \
$(PKGROOT)/src/common/random.o \
$(PKGROOT)/src/common/stats.o \
$(PKGROOT)/src/common/survival_util.o \
$(PKGROOT)/src/common/threading_utils.o \
$(PKGROOT)/src/common/timer.o \
2 changes: 2 additions & 0 deletions R-package/src/Makevars.win
@@ -55,6 +55,7 @@ OBJECTS= \
$(PKGROOT)/src/predictor/cpu_predictor.o \
$(PKGROOT)/src/tree/constraints.o \
$(PKGROOT)/src/tree/param.o \
$(PKGROOT)/src/tree/fit_stump.o \
$(PKGROOT)/src/tree/tree_model.o \
$(PKGROOT)/src/tree/tree_updater.o \
$(PKGROOT)/src/tree/updater_approx.o \
@@ -85,6 +86,7 @@ OBJECTS= \
$(PKGROOT)/src/common/pseudo_huber.o \
$(PKGROOT)/src/common/quantile.o \
$(PKGROOT)/src/common/random.o \
$(PKGROOT)/src/common/stats.o \
$(PKGROOT)/src/common/survival_util.o \
$(PKGROOT)/src/common/threading_utils.o \
$(PKGROOT)/src/common/timer.o \
2 changes: 2 additions & 0 deletions include/xgboost/base.h
@@ -134,6 +134,8 @@ using bst_row_t = std::size_t; // NOLINT
using bst_node_t = int32_t; // NOLINT
/*! \brief Type for ranking group index. */
using bst_group_t = uint32_t; // NOLINT
/*! \brief Type for indexing target variables. */
using bst_target_t = std::size_t; // NOLINT

namespace detail {
/*! \brief Implementation of gradient statistics pair. Template specialisation
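For illustration only (not part of the diff): a minimal sketch of how the new `bst_target_t` alias could be used to size per-target buffers; the function name is hypothetical.

```cpp
#include <vector>

#include "xgboost/base.h"  // bst_target_t

// Hypothetical helper: one accumulator per target, indexed by bst_target_t.
std::vector<double> MakePerTargetBuffer(xgboost::bst_target_t n_targets) {
  return std::vector<double>(n_targets, 0.0);
}
```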
26 changes: 24 additions & 2 deletions include/xgboost/linalg.h
@@ -15,6 +15,7 @@

#include <algorithm>
#include <cassert>
#include <cinttypes> // std::int32_t
#include <limits>
#include <string>
#include <tuple>
@@ -388,9 +389,9 @@ class TensorView {
* \brief Create a tensor with data, shape and strides. Don't use this constructor if
* stride can be calculated from shape.
*/
template <typename I, int32_t D>
template <typename I, std::int32_t D>
LINALG_HD TensorView(common::Span<T> data, I const (&shape)[D], I const (&stride)[D],
int32_t device)
std::int32_t device)
: data_{data}, ptr_{data_.data()}, device_{device} {
static_assert(D == kDim, "Invalid shape & stride.");
detail::UnrollLoop<D>([&](auto i) {
@@ -833,6 +834,27 @@ class Tensor {
int32_t DeviceIdx() const { return data_.DeviceIdx(); }
};

template <typename T>
using Vector = Tensor<T, 1>;

template <typename T, typename... Index>
auto Constant(Context const *ctx, T v, Index &&...index) {
Tensor<T, sizeof...(Index)> t;
t.SetDevice(ctx->gpu_id);
t.Reshape(index...);
t.Data()->Fill(std::move(v));
return t;
}


/**
* \brief Like `np.zeros`, return a new array of given shape and type, filled with zeros.
*/
template <typename T, typename... Index>
auto Zeros(Context const *ctx, Index &&...index) {
return Constant(ctx, static_cast<T>(0), index...);
}

// Only first axis is supported for now.
template <typename T, int32_t D>
void Stack(Tensor<T, D> *l, Tensor<T, D> const &r) {
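A minimal usage sketch of the new `Vector` alias and the `Constant`/`Zeros` helpers added above; it assumes a default (CPU) `Context` and is not part of the diff.

```cpp
#include "xgboost/context.h"
#include "xgboost/linalg.h"

namespace xgboost {
void LinalgHelpersExample() {
  Context ctx;  // assumed to default to CPU (gpu_id == -1)
  // 2x3 tensor with every element set to 1.5f, allocated on ctx's device.
  auto filled = linalg::Constant(&ctx, 1.5f, 2, 3);
  // 1-D tensor (i.e. linalg::Vector<float>) of 4 zeros, like np.zeros(4).
  linalg::Vector<float> zeros = linalg::Zeros<float>(&ctx, 4);
  // filled.Shape() == {2, 3}, zeros.Size() == 4.
}
}  // namespace xgboost
```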
2 changes: 1 addition & 1 deletion include/xgboost/objective.h
@@ -93,7 +93,7 @@ class ObjFunction : public Configurable {
* \brief Return number of targets for input matrix. Right now XGBoost supports only
* multi-target regression.
*/
virtual uint32_t Targets(MetaInfo const& info) const {
virtual bst_target_t Targets(MetaInfo const& info) const {
if (info.labels.Shape(1) > 1) {
LOG(FATAL) << "multioutput is not supported by current objective function";
}
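A hedged sketch (not taken from the PR) of how a multi-target objective might override the retyped `Targets()`; the class name is hypothetical and the remaining `ObjFunction` overrides are omitted.

```cpp
#include <algorithm>  // std::max

#include "xgboost/data.h"       // MetaInfo
#include "xgboost/objective.h"  // ObjFunction

namespace xgboost {
class HypotheticalMultiTargetObj : public ObjFunction {
 public:
  // One target per label column; at least one even for an empty DMatrix.
  bst_target_t Targets(MetaInfo const& info) const override {
    return std::max(static_cast<bst_target_t>(1), info.labels.Shape(1));
  }
  // ... GetGradient(), Configure(), etc. omitted in this sketch ...
};
}  // namespace xgboost
```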
1 change: 1 addition & 0 deletions src/common/host_device_vector.cc
@@ -172,6 +172,7 @@ void HostDeviceVector<T>::SetDevice(int) const {}
template class HostDeviceVector<bst_float>;
template class HostDeviceVector<double>;
template class HostDeviceVector<GradientPair>;
template class HostDeviceVector<GradientPairPrecise>;
template class HostDeviceVector<int32_t>; // bst_node_t
template class HostDeviceVector<uint8_t>;
template class HostDeviceVector<FeatureType>;
1 change: 1 addition & 0 deletions src/common/host_device_vector.cu
@@ -404,6 +404,7 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
template class HostDeviceVector<bst_float>;
template class HostDeviceVector<double>;
template class HostDeviceVector<GradientPair>;
template class HostDeviceVector<GradientPairPrecise>;
template class HostDeviceVector<int32_t>; // bst_node_t
template class HostDeviceVector<uint8_t>;
template class HostDeviceVector<FeatureType>;
9 changes: 2 additions & 7 deletions src/common/numeric.cc
@@ -3,10 +3,8 @@
*/
#include "numeric.h"

#include <numeric> // std::accumulate
#include <type_traits> // std::is_same

#include "threading_utils.h" // MemStackAllocator, ParallelFor, DefaultMaxThreads
#include "xgboost/context.h" // Context
#include "xgboost/host_device_vector.h" // HostDeviceVector

@@ -15,14 +13,11 @@ namespace common {
double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
if (ctx->IsCPU()) {
auto const& h_values = values.ConstHostVector();
MemStackAllocator<double, DefaultMaxThreads()> result_tloc(ctx->Threads(), 0);
ParallelFor(h_values.size(), ctx->Threads(),
[&](auto i) { result_tloc[omp_get_thread_num()] += h_values[i]; });
auto result = std::accumulate(result_tloc.cbegin(), result_tloc.cend(), 0.0);
auto result = cpu_impl::Reduce(ctx, h_values.cbegin(), h_values.cend(), 0.0);
static_assert(std::is_same<decltype(result), double>::value, "");
return result;
}
return cuda::Reduce(ctx, values);
return cuda_impl::Reduce(ctx, values);
}
} // namespace common
} // namespace xgboost
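An illustrative sketch (not part of the diff) of the public `common::Reduce` entry point after the refactor; it assumes a CPU `Context`, and the `SumValues` name is hypothetical.

```cpp
#include "common/numeric.h"  // path relative to src/
#include "xgboost/context.h"
#include "xgboost/host_device_vector.h"

double SumValues() {
  xgboost::Context ctx;  // CPU by default in this sketch
  xgboost::HostDeviceVector<float> values{1.0f, 2.0f, 3.5f};
  // Dispatches to cpu_impl::Reduce on CPU and cuda_impl::Reduce on GPU.
  return xgboost::common::Reduce(&ctx, values);  // 6.5
}
```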
12 changes: 5 additions & 7 deletions src/common/numeric.cu
@@ -2,24 +2,22 @@
* Copyright 2022 by XGBoost Contributors
*/
#include <thrust/execution_policy.h>
#include <thrust/functional.h> // thrust:plus

#include "device_helpers.cuh" // dh::Reduce, safe_cuda, dh::XGBCachingDeviceAllocator
#include "device_helpers.cuh" // dh::Reduce, dh::XGBCachingDeviceAllocator
#include "numeric.h"
#include "xgboost/context.h" // Context
#include "xgboost/host_device_vector.h" // HostDeviceVector

namespace xgboost {
namespace common {
namespace cuda {
namespace cuda_impl {
double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
values.SetDevice(ctx->gpu_id);
auto const d_values = values.ConstDeviceSpan();
dh::XGBCachingDeviceAllocator<char> alloc;
auto res = dh::Reduce(thrust::cuda::par(alloc), d_values.data(),
d_values.data() + d_values.size(), 0.0, thrust::plus<double>{});
return res;
return dh::Reduce(thrust::cuda::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0,
thrust::plus<float>{});
}
} // namespace cuda
} // namespace cuda_impl
} // namespace common
} // namespace xgboost
22 changes: 19 additions & 3 deletions src/common/numeric.h
@@ -95,17 +95,33 @@ void PartialSum(int32_t n_threads, InIt begin, InIt end, T init, OutIt out_it) {
exc.Rethrow();
}

namespace cuda {
namespace cuda_impl {
double Reduce(Context const* ctx, HostDeviceVector<float> const& values);
#if !defined(XGBOOST_USE_CUDA)
inline double Reduce(Context const*, HostDeviceVector<float> const&) {
AssertGPUSupport();
return 0;
}
#endif // !defined(XGBOOST_USE_CUDA)
} // namespace cuda
} // namespace cuda_impl

/**
* \brief Reduction with iterator. init must be additive identity. (0 for primitive types)
*/
namespace cpu_impl {
template <typename It, typename V = typename It::value_type>
V Reduce(Context const* ctx, It first, It second, V const& init) {
size_t n = std::distance(first, second);
common::MemStackAllocator<V, common::DefaultMaxThreads()> result_tloc(ctx->Threads(), init);
common::ParallelFor(n, ctx->Threads(),
[&](auto i) { result_tloc[omp_get_thread_num()] += first[i]; });
auto result = std::accumulate(result_tloc.cbegin(), result_tloc.cbegin() + ctx->Threads(), init);
return result;
}
} // namespace cpu_impl

/**
* \brief Reduction with summation.
* \brief Reduction on host device vector.
*/
double Reduce(Context const* ctx, HostDeviceVector<float> const& values);

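A small sketch (not in the diff) of calling the new iterator-based `cpu_impl::Reduce` directly; per the comment above, `init` must be the additive identity because it seeds every thread-local accumulator.

```cpp
#include <vector>

#include "common/numeric.h"  // cpu_impl::Reduce, path relative to src/
#include "xgboost/context.h"

double SumQuarters() {
  xgboost::Context ctx;
  std::vector<double> values(1024, 0.25);
  // init = 0.0 (additive identity); a non-zero init would be added once per
  // thread-local slot and once more in the final accumulate.
  return xgboost::common::cpu_impl::Reduce(&ctx, values.cbegin(), values.cend(), 0.0);
  // == 256.0
}
```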
2 changes: 1 addition & 1 deletion src/common/quantile.cu
@@ -641,7 +641,7 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts) {
thrust::equal_to<bst_feature_t>{},
[] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_values));
auto max_it = common::MakeIndexTransformIter([&](auto i) {
auto max_it = MakeIndexTransformIter([&](auto i) {
if (IsCat(h_feature_types, i)) {
return max_values[i].value;
}
64 changes: 64 additions & 0 deletions src/common/stats.cc
@@ -0,0 +1,64 @@
/*!
* Copyright 2022 by XGBoost Contributors
*/
#include "stats.h"

#include <numeric> // std::accumulate

#include "common.h" // OptionalWeights
#include "threading_utils.h" // ParallelFor, MemStackAllocator
#include "transform_iterator.h" // MakeIndexTransformIter
#include "xgboost/context.h" // Context
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/linalg.h" // Tensor, UnravelIndex, Apply
#include "xgboost/logging.h" // CHECK_EQ

namespace xgboost {
namespace common {
float Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
HostDeviceVector<float> const& weights) {
CHECK_LE(t.Shape(1), 1) << "Matrix is not yet supported.";
if (!ctx->IsCPU()) {
weights.SetDevice(ctx->gpu_id);
auto opt_weights = OptionalWeights(weights.ConstDeviceSpan());
auto t_v = t.View(ctx->gpu_id);
return cuda_impl::Median(ctx, t_v, opt_weights);
}

auto opt_weights = OptionalWeights(weights.ConstHostSpan());
auto t_v = t.HostView();
auto iter = common::MakeIndexTransformIter(
[&](size_t i) { return linalg::detail::Apply(t_v, linalg::UnravelIndex(i, t_v.Shape())); });
float q{0};
if (opt_weights.Empty()) {
q = common::Quantile(0.5, iter, iter + t_v.Size());
} else {
CHECK_NE(t_v.Shape(1), 0);
auto w_it = common::MakeIndexTransformIter([&](size_t i) {
auto sample_idx = i / t_v.Shape(1);
return opt_weights[sample_idx];
});
q = common::WeightedQuantile(0.5, iter, iter + t_v.Size(), w_it);
}
return q;
}

void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<float>* out) {
v.SetDevice(ctx->gpu_id);
out->SetDevice(ctx->gpu_id);
out->Reshape(1);

if (ctx->IsCPU()) {
auto h_v = v.HostView();
float n = v.Size();
MemStackAllocator<float, DefaultMaxThreads()> tloc(ctx->Threads(), 0.0f);
ParallelFor(v.Size(), ctx->Threads(),
[&](auto i) { tloc[omp_get_thread_num()] += h_v(i) / n; });
auto ret = std::accumulate(tloc.cbegin(), tloc.cend(), .0f);
out->HostView()(0) = ret;
} else {
cuda_impl::Mean(ctx, v.View(ctx->gpu_id), out->View(ctx->gpu_id));
}
}
} // namespace common
} // namespace xgboost
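A usage sketch of the new `common::Mean` helper (illustrative, not from the PR); it reuses the `linalg::Constant` helper introduced above and works on CPU or GPU depending on the context.

```cpp
#include "common/stats.h"  // Mean, path relative to src/
#include "xgboost/context.h"
#include "xgboost/linalg.h"

float MeanOfConstantVector(xgboost::Context const* ctx) {
  // Vector of eight 2.0f values on the context's device.
  auto v = xgboost::linalg::Constant(ctx, 2.0f, 8);
  xgboost::linalg::Vector<float> out;
  xgboost::common::Mean(ctx, v, &out);  // out is reshaped to a single element
  return out.HostView()(0);             // 2.0f
}
```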
15 changes: 13 additions & 2 deletions src/common/stats.cu
@@ -13,7 +13,7 @@

namespace xgboost {
namespace common {
namespace cuda {
namespace cuda_impl {
float Median(Context const* ctx, linalg::TensorView<float const, 2> t,
common::OptionalWeights weights) {
HostDeviceVector<size_t> segments{0, t.Size()};
@@ -42,6 +42,17 @@ float Median(Context const* ctx, linalg::TensorView<float const, 2> t,
CHECK_EQ(quantile.Size(), 1);
return quantile.HostVector().front();
}
} // namespace cuda

void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorView<float> out) {
float n = v.Size();
auto it = dh::MakeTransformIterator<float>(
thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) { return v(i) / n; });
std::size_t bytes;
CHECK_EQ(out.Size(), 1);
cub::DeviceReduce::Sum(nullptr, bytes, it, out.Values().data(), v.Size());
dh::TemporaryArray<char> temp{bytes};
cub::DeviceReduce::Sum(temp.data().get(), bytes, it, out.Values().data(), v.Size());
}
} // namespace cuda_impl
} // namespace common
} // namespace xgboost
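A standalone sketch of the two-pass `cub::DeviceReduce::Sum` pattern used by `Mean` above: the first call with a null temp pointer only sizes the scratch buffer, the second performs the reduction. Assumes nvcc/CUDA; the function name is hypothetical and error checking is omitted.

```cpp
#include <cub/cub.cuh>
#include <thrust/device_vector.h>

#include <cstddef>  // std::size_t

float SumOnDevice(thrust::device_vector<float> const& in) {
  thrust::device_vector<float> out(1);
  std::size_t bytes = 0;
  // First pass: d_temp_storage == nullptr, so only `bytes` is written.
  cub::DeviceReduce::Sum(nullptr, bytes, in.data().get(), out.data().get(),
                         static_cast<int>(in.size()));
  thrust::device_vector<char> temp(bytes);
  // Second pass: run the reduction with the allocated scratch space.
  cub::DeviceReduce::Sum(temp.data().get(), bytes, in.data().get(), out.data().get(),
                         static_cast<int>(in.size()));
  return out[0];  // copies the single-element result back to the host
}
```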
47 changes: 15 additions & 32 deletions src/common/stats.h
@@ -8,10 +8,11 @@
#include <limits>
#include <vector>

#include "common.h" // AssertGPUSupport
#include "common.h" // AssertGPUSupport, OptionalWeights
#include "transform_iterator.h" // MakeIndexTransformIter
#include "xgboost/context.h" // Context
#include "xgboost/linalg.h"
#include "xgboost/logging.h" // CHECK_GE

namespace xgboost {
namespace common {
@@ -93,43 +94,25 @@ float WeightedQuantile(double alpha, Iter begin, Iter end, WeightIter weights) {
return val(idx);
}

namespace cuda {
float Median(Context const* ctx, linalg::TensorView<float const, 2> t,
common::OptionalWeights weights);
namespace cuda_impl {
float Median(Context const* ctx, linalg::TensorView<float const, 2> t, OptionalWeights weights);
void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorView<float> out);

#if !defined(XGBOOST_USE_CUDA)
inline float Median(Context const*, linalg::TensorView<float const, 2>, common::OptionalWeights) {
AssertGPUSupport();
inline float Median(Context const*, linalg::TensorView<float const, 2>, OptionalWeights) {
common::AssertGPUSupport();
return 0;
}
inline void Mean(Context const*, linalg::VectorView<float const>, linalg::VectorView<float>) {
common::AssertGPUSupport();
}
#endif // !defined(XGBOOST_USE_CUDA)
} // namespace cuda
} // namespace cuda_impl

inline float Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
HostDeviceVector<float> const& weights) {
if (!ctx->IsCPU()) {
weights.SetDevice(ctx->gpu_id);
auto opt_weights = OptionalWeights(weights.ConstDeviceSpan());
auto t_v = t.View(ctx->gpu_id);
return cuda::Median(ctx, t_v, opt_weights);
}
float Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
HostDeviceVector<float> const& weights);

auto opt_weights = OptionalWeights(weights.ConstHostSpan());
auto t_v = t.HostView();
auto iter = common::MakeIndexTransformIter(
[&](size_t i) { return linalg::detail::Apply(t_v, linalg::UnravelIndex(i, t_v.Shape())); });
float q{0};
if (opt_weights.Empty()) {
q = common::Quantile(0.5, iter, iter + t_v.Size());
} else {
CHECK_NE(t_v.Shape(1), 0);
auto w_it = common::MakeIndexTransformIter([&](size_t i) {
auto sample_idx = i / t_v.Shape(1);
return opt_weights[sample_idx];
});
q = common::WeightedQuantile(0.5, iter, iter + t_v.Size(), w_it);
}
return q;
}
void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<float>* out);
} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_STATS_H_
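An illustrative call to the now out-of-line `Median` declaration (a sketch, not part of the PR); it assumes a CPU `Context`, and the empty weight vector selects the unweighted quantile path.

```cpp
#include <cstddef>  // std::size_t

#include "common/stats.h"  // Median, path relative to src/
#include "xgboost/context.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/linalg.h"

float MedianOfColumn(xgboost::Context const* ctx) {
  // Single-column matrix; multi-column input is still rejected.
  xgboost::linalg::Tensor<float, 2> t;
  t.Reshape(5, 1);
  auto h = t.HostView();
  float const data[] = {1.0f, 9.0f, 3.0f, 7.0f, 5.0f};
  for (std::size_t i = 0; i < 5; ++i) {
    h(i, 0) = data[i];
  }
  xgboost::HostDeviceVector<float> weights;  // empty => unweighted median
  return xgboost::common::Median(ctx, t, weights);  // 5.0f
}
```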
1 change: 1 addition & 0 deletions src/objective/adaptive.cc
@@ -10,6 +10,7 @@
#include "../common/numeric.h"
#include "../common/stats.h"
#include "../common/threading_utils.h"
#include "../common/transform_iterator.h" // MakeIndexTransformIter
#include "xgboost/tree_model.h"

namespace xgboost {