Update to clang 16.0.6. #5583

Merged · 3 commits · Sep 19, 2023
Changes from all commits
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml

@@ -23,7 +23,7 @@ repos:
      hooks:
        - id: cython-lint
    - repo: https://github.com/pre-commit/mirrors-clang-format
-     rev: v16.0.1
+     rev: v16.0.6
      hooks:
        - id: clang-format
          types_or: [c, c++, cuda]

2 changes: 1 addition & 1 deletion BUILD.md

@@ -11,7 +11,7 @@ To install cuML from source, ensure the following dependencies are met:
  5. Cython (>= 0.29)
  6. gcc (>= 9.0)
  7. BLAS - Any BLAS compatible with cmake's [FindBLAS](https://cmake.org/cmake/help/v3.14/module/FindBLAS.html). Note that the blas has to be installed to the same folder system as cmake, for example if using conda installed cmake, the blas implementation should also be installed in the conda environment.
- 8. clang-format (= 16.0.1) - enforces uniform C++ coding style; required to build cuML from source. The packages `clang=16` and `clang-tools=16` from the conda-forge channel should be sufficient, if you are on conda. If not using conda, install the right version using your OS package manager.
+ 8. clang-format (= 16.0.6) - enforces uniform C++ coding style; required to build cuML from source. The packages `clang=16` and `clang-tools=16` from the conda-forge channel should be sufficient, if you are on conda. If not using conda, install the right version using your OS package manager.
  9. NCCL (>=2.4)
  10. UCX [optional] (>= 1.7) - enables point-to-point messaging in the cuML standard communicator. This is necessary for many multi-node multi-GPU cuML algorithms to function.

4 changes: 2 additions & 2 deletions conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml

@@ -8,8 +8,8 @@ channels:
  - nvidia
  dependencies:
  - c-compiler
- - clang-tools==15.0.7
- - clang==15.0.7
+ - clang-tools==16.0.6
+ - clang==16.0.6
  - cmake>=3.26.4
  - cuda-version=11.8
  - cudatoolkit

2 changes: 1 addition & 1 deletion cpp/README.md

@@ -18,7 +18,7 @@ The `test` directory has subdirectories that reflect this distinction between th
  1. cmake (>= 3.26.4)
  2. CUDA (>= 11.0)
  3. gcc (>=9.3.0)
- 4. clang-format (= 16.0.1) - enforces uniform C++ coding style; required to build cuML from source. The packages `clang=16` and `clang-tools=16` from the conda-forge channel should be sufficient, if you are on conda. If not using conda, install the right version using your OS package manager.
+ 4. clang-format (= 16.0.6) - enforces uniform C++ coding style; required to build cuML from source. The packages `clang=16` and `clang-tools=16` from the conda-forge channel should be sufficient, if you are on conda. If not using conda, install the right version using your OS package manager.

  ### Building cuML:

10 changes: 5 additions & 5 deletions cpp/bench/sg/rf_classifier.cu

@@ -85,11 +85,11 @@ std::vector<Params> getInputs()
  std::vector<Params> out;
  Params p;
  p.data.rowMajor = false;
- p.blobs = {10.0, // cluster_std
-            false, // shuffle
-            -10.0, // center_box_min
-            10.0, // center_box_max
-            2152953ULL}; // seed
+ p.blobs = {10.0,         // cluster_std
+            false,        // shuffle
+            -10.0,        // center_box_min
+            10.0,         // center_box_max
+            2152953ULL};  // seed

  p.rf = set_rf_params(10, /*max_depth */
                       (1 << 20), /* max_leaves */

2 changes: 1 addition & 1 deletion cpp/bench/sg/rf_regressor.cu

@@ -87,7 +87,7 @@ std::vector<RegParams> getInputs()
  p.regression = {.shuffle = true, // Better to shuffle when n_informative < ncols
                  .effective_rank = -1, // dataset generation will be faster
                  .bias = 4.5,
-                 .tail_strength = 0.5, // unused when effective_rank = -1
+                 .tail_strength = 0.5,  // unused when effective_rank = -1
                  .noise = 1.0,
                  .seed = 12345ULL};

4 changes: 2 additions & 2 deletions cpp/bench/sg/svr.cu

@@ -91,9 +91,9 @@ std::vector<SvrParams<D>> getInputs()

  p.regression.shuffle = true; // better to shuffle when n_informative < ncols
  p.regression.seed = 1378ULL;
- p.regression.effective_rank = -1; // dataset generation will be faster
+ p.regression.effective_rank = -1;  // dataset generation will be faster
  p.regression.bias = 0;
- p.regression.tail_strength = 0.5; // unused when effective_rank = -1
+ p.regression.tail_strength = 0.5;  // unused when effective_rank = -1
  p.regression.noise = 1;

  // SvmParameter{C, cache_size, max_iter, nochange_steps, tol, verbosity,

2 changes: 1 addition & 1 deletion cpp/include/cuml/experimental/fil/detail/node.hpp

@@ -171,7 +171,7 @@ struct alignas(
  if constexpr (layout == tree_layout::depth_first) {
    return offset_type{1} + condition * (aligned_data.inner_data.distant_offset - offset_type{1});
  } else if constexpr (layout == tree_layout::breadth_first) {
-   return condition* offset_type{1} + (aligned_data.inner_data.distant_offset - offset_type{1});
+   return condition * offset_type{1} + (aligned_data.inner_data.distant_offset - offset_type{1});
  } else {
    static_assert(layout == tree_layout::depth_first);
  }

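The fix in this hunk is semantic whitespace: `condition*` was being spaced as if `*` were a pointer declarator, when it is actually multiplication. A minimal sketch of the same branch-free child-offset arithmetic, with hypothetical names standing in for the actual FIL types:

```cpp
#include <cstdint>
#include <iostream>

enum class tree_layout { depth_first, breadth_first };

// Branch-free offset from a node to its chosen child. `condition` is 0
// (hot child) or 1 (distant child); `distant_offset` is the distance to
// the distant child in the given layout.
template <tree_layout layout>
std::uint32_t child_offset(std::uint32_t condition, std::uint32_t distant_offset)
{
  if constexpr (layout == tree_layout::depth_first) {
    // Hot child is adjacent (offset 1); distant child is distant_offset away.
    return 1 + condition * (distant_offset - 1);
  } else {
    // Breadth-first: the two children sit next to each other.
    return condition * 1 + (distant_offset - 1);
  }
}

int main()
{
  std::cout << child_offset<tree_layout::depth_first>(0, 5) << '\n';    // 1
  std::cout << child_offset<tree_layout::depth_first>(1, 5) << '\n';    // 5
  std::cout << child_offset<tree_layout::breadth_first>(0, 5) << '\n';  // 4
  std::cout << child_offset<tree_layout::breadth_first>(1, 5) << '\n';  // 5
}
```
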
2 changes: 1 addition & 1 deletion cpp/include/cuml/genetic/node.h

@@ -86,7 +86,7 @@ struct node {
    tanh,
    unary_end = tanh, // keep this to be the last unary function in the list
    functions_end = unary_end,
- }; // enum type
+ };  // enum type

  /**
   * @brief Default constructor for node

2 changes: 1 addition & 1 deletion cpp/scripts/run-clang-tidy.py

@@ -25,7 +25,7 @@

  import tomli

- EXPECTED_VERSION = "15.0.7"
+ EXPECTED_VERSION = "16.0.6"
  VERSION_REGEX = re.compile(r" LLVM version ([0-9.]+)")
  GPU_ARCH_REGEX = re.compile(r"sm_(\d+)")
  SPACES = re.compile(r"\s+")

2 changes: 1 addition & 1 deletion cpp/src/decisiontree/batched-levelalgo/builder.cuh

@@ -285,7 +285,7 @@ struct Builder {
  d_wsize += calculateAlignedBytes(sizeof(IdxT) * max_batch * dataset.n_sampled_cols); // colids

  // all nodes in the tree
- h_wsize += // h_workload_info
+ h_wsize +=  // h_workload_info
    calculateAlignedBytes(sizeof(WorkloadInfo<IdxT>) * max_blocks_dimx);
  h_wsize += calculateAlignedBytes(sizeof(SplitT) * max_batch); // splits

12 changes: 6 additions & 6 deletions cpp/src/fil/fil.cu

@@ -17,20 +17,20 @@
  /** @file fil.cu fil.cu implements the forest data types (dense and sparse), including their
      creation and prediction (the main inference kernel is defined in infer.cu). */

- #include "common.cuh" // for predict_params, storage, storage
- #include "internal.cuh" // for cat_sets_device_owner, categorical_sets, output_t,
+ #include "common.cuh"    // for predict_params, storage, storage
+ #include "internal.cuh"  // for cat_sets_device_owner, categorical_sets, output_t,

- #include <cuml/fil/fil.h> // for algo_t,
+ #include <cuml/fil/fil.h>  // for algo_t,

  #include <raft/core/error.hpp> // for ASSERT
  #include <raft/core/handle.hpp> // for handle_t
  #include <raft/util/cudart_utils.hpp> // for RAFT_CUDA_TRY, cudaStream_t,
  #include <rmm/device_uvector.hpp> // for device_uvector
  #include <thrust/host_vector.h> // for host_vector

- #include <cmath> // for expf
- #include <cstddef> // for size_t
- #include <cstdint> // for uint8_t
+ #include <cmath>   // for expf
+ #include <cstddef> // for size_t
+ #include <cstdint> // for uint8_t

  namespace ML {
  namespace fil {

26 changes: 13 additions & 13 deletions cpp/src/fil/treelite_import.cu

@@ -22,7 +22,7 @@

  #include <cuml/common/logger.hpp> // for CUML_LOG_WARN
  #include <cuml/fil/fil.h> // for algo_t, from_treelite, storage_type_repr, storage_type_t, treelite_params_t
- #include <cuml/fil/fnv_hash.h> // for fowler_noll_vo_fingerprint64_32
+ #include <cuml/fil/fnv_hash.h>  // for fowler_noll_vo_fingerprint64_32

  #include <raft/core/error.hpp> // for ASSERT
  #include <raft/core/handle.hpp> // for handle_t

@@ -32,18 +32,18 @@
  #include <treelite/c_api.h> // for ModelHandle
  #include <treelite/tree.h> // for Tree, Model, ModelImpl, ModelParam

- #include <omp.h> // for omp
-
- #include <algorithm> // for std::max
- #include <bitset> // for std::bitset
- #include <cmath> // for NAN
- #include <cstddef> // for std::size_t
- #include <cstdint> // for uint8_t
- #include <iosfwd> // for ios, stringstream
- #include <limits> // for std::numeric_limits
- #include <stack> // for std::stack
- #include <string> // for std::string
- #include <type_traits> // for std::is_same
+ #include <omp.h>  // for omp
+
+ #include <algorithm>   // for std::max
+ #include <bitset>      // for std::bitset
+ #include <cmath>       // for NAN
+ #include <cstddef>     // for std::size_t
+ #include <cstdint>     // for uint8_t
+ #include <iosfwd>      // for ios, stringstream
+ #include <limits>      // for std::numeric_limits
+ #include <stack>       // for std::stack
+ #include <string>      // for std::string
+ #include <type_traits> // for std::is_same

  namespace ML {
  namespace fil {

12 changes: 6 additions & 6 deletions cpp/src/genetic/fitness.cuh

@@ -63,18 +63,18 @@ void weightedPearson(const raft::handle_t& h,
  rmm::device_uvector<math_t> y_tmp(n_samples, stream);
  rmm::device_uvector<math_t> x_tmp(n_samples * n_progs, stream);

- rmm::device_scalar<math_t> y_mu(stream); // output mean
- rmm::device_uvector<math_t> x_mu(n_progs, stream); // predicted output mean
+ rmm::device_scalar<math_t> y_mu(stream);            // output mean
+ rmm::device_uvector<math_t> x_mu(n_progs, stream);  // predicted output mean

  rmm::device_uvector<math_t> y_diff(n_samples, stream); // normalized output
  rmm::device_uvector<math_t> x_diff(n_samples * n_progs,
-                                    stream); // normalized predicted output
+                                    stream);  // normalized predicted output

- rmm::device_uvector<math_t> y_std(1, stream); // output stddev
+ rmm::device_uvector<math_t> y_std(1, stream);  // output stddev
  rmm::device_uvector<math_t> x_std(n_progs,
-                                   stream); // predicted output stddev
+                                   stream);  // predicted output stddev

- rmm::device_scalar<math_t> dWS(stream); // sample weight sum
+ rmm::device_scalar<math_t> dWS(stream);  // sample weight sum
  math_t N = (math_t)n_samples;

  // Sum of weights

2 changes: 1 addition & 1 deletion cpp/src/glm/qn/glm_base.cuh

@@ -192,7 +192,7 @@ struct GLMBase : GLMDims {
                cudaStream_t stream,
                bool initGradZero = true)
  {
-   Loss* loss = static_cast<Loss*>(this); // static polymorphism
+   Loss* loss = static_cast<Loss*>(this);  // static polymorphism

    linearFwd(handle, Zb, Xb, W); // linear part: forward pass
    loss->getLossAndDZ(loss_val, Zb, yb, stream); // loss specific part

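Context for the trailing comment: the cast is the CRTP ("static polymorphism") idiom, where the base class dispatches to the derived loss at compile time rather than through a vtable. A minimal sketch of the pattern, with hypothetical names rather than the actual cuML hierarchy:

```cpp
#include <iostream>

// The base class knows the derived type at compile time, so no virtual call
// is needed to reach the loss-specific code.
template <typename Loss>
struct GlmBase {
  double loss_value(double z, double y)
  {
    Loss* loss = static_cast<Loss*>(this);  // static polymorphism
    return loss->eval(z, y);                // resolved at compile time
  }
};

struct SquaredLoss : GlmBase<SquaredLoss> {
  double eval(double z, double y) { return 0.5 * (z - y) * (z - y); }
};

int main()
{
  SquaredLoss l;
  std::cout << l.loss_value(2.0, 1.0) << '\n';  // prints 0.5
}
```
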
20 changes: 10 additions & 10 deletions cpp/src/glm/qn/qn_util.cuh

@@ -53,19 +53,19 @@ enum OPT_RETCODE {
  template <typename T = double>
  class LBFGSParam {
   public:
-  int m; // lbfgs memory limit
-  T epsilon; // controls convergence
-  int past; // lookback for function value based convergence test
-  T delta; // controls fun val based conv test
+  int m;       // lbfgs memory limit
+  T epsilon;   // controls convergence
+  int past;    // lookback for function value based convergence test
+  T delta;     // controls fun val based conv test
   int max_iterations;
   int linesearch; // see enum above
   int max_linesearch;
-  T min_step; // min. allowed step length
-  T max_step; // max. allowed step length
-  T ftol; // line search tolerance
-  T wolfe; // wolfe parameter
-  T ls_dec; // line search decrease factor
-  T ls_inc; // line search increase factor
+  T min_step;  // min. allowed step length
+  T max_step;  // max. allowed step length
+  T ftol;      // line search tolerance
+  T wolfe;     // wolfe parameter
+  T ls_dec;    // line search decrease factor
+  T ls_inc;    // line search increase factor

   public:
   LBFGSParam()

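For readers of `past` and `delta`: L-BFGS implementations in this style typically declare convergence when the objective has barely moved over the last `past` iterations. A sketch of such a test — an assumption about the semantics of these two fields, not cuML's exact code:

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Function-value convergence test driven by `past` and `delta`: stop when
// |f_k - f_{k-past}| is small relative to |f_k| (or to 1 near zero).
template <typename T>
bool fun_val_converged(const std::vector<T>& fx_history, int k, int past, T delta)
{
  if (past <= 0 || k < past) return false;  // test disabled or not enough history
  T fx      = fx_history[k];
  T fx_past = fx_history[k - past];
  return std::abs(fx_past - fx) <= delta * std::max(std::abs(fx), T(1));
}
```
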
6 changes: 3 additions & 3 deletions cpp/src/knn/knn_opg_common.cuh

@@ -91,9 +91,9 @@ struct opg_knn_param {
  size_t batch_size = 0; /**< Batch size */
  bool verbose; /**< verbose */

- std::size_t n_outputs = 0; /**< Number of outputs per query (cl&re) */
- std::vector<std::vector<out_t*>>* y; /**< Labels input array (cl&re) */
- std::vector<Matrix::Data<out_t>*>* out; /**< KNN outputs output array (cl&re) */
+ std::size_t n_outputs = 0;               /**< Number of outputs per query (cl&re) */
+ std::vector<std::vector<out_t*>>* y;     /**< Labels input array (cl&re) */
+ std::vector<Matrix::Data<out_t>*>* out;  /**< KNN outputs output array (cl&re) */

  std::vector<int>* n_unique = nullptr; /**< Number of unique labels (classification) */
  std::vector<out_t*>* uniq_labels = nullptr; /**< Unique labels (classification) */

2 changes: 1 addition & 1 deletion cpp/src/svm/results.cuh

@@ -306,7 +306,7 @@ class Results {
  SvmType svmType; //!< SVM problem type: SVC or SVR
  int n_train; //!< number of training vectors (including duplicates for SVR)

- const int TPB = 256; // threads per block
+ const int TPB = 256;  // threads per block
  // Temporary variables used by cub in GetResults
  rmm::device_scalar<int> d_num_selected;
  rmm::device_scalar<math_t> d_val_reduced;

4 changes: 2 additions & 2 deletions cpp/src/svm/smosolver.cuh

@@ -471,7 +471,7 @@ class SmoSolver {
  rmm::device_uvector<math_t> f; //!< optimality indicator vector
  rmm::device_uvector<math_t> y_label; //!< extra label for regression

- rmm::device_uvector<math_t> C_vec; //!< penalty parameter vector
+ rmm::device_uvector<math_t> C_vec;  //!< penalty parameter vector

  // Buffers for the working set [n_ws]
  //! change in alpha parameter during a blocksolve step

@@ -490,7 +490,7 @@
  raft::distance::kernels::KernelType kernel_type;
  float cache_size; //!< size of kernel cache in MiB

- SvmType svmType; ///!< Type of the SVM problem to solve
+ SvmType svmType;  ///!< Type of the SVM problem to solve

  // Variables to track convergence of training
  math_t diff_prev;

2 changes: 1 addition & 1 deletion cpp/src/tsne/barnes_hut_kernels.cuh

@@ -542,7 +542,7 @@ __global__ __launch_bounds__(THREADS4, FACTOR4) void SortKernel(value_idx* restrict
  */
  template <typename value_idx, typename value_t>
  __global__ __launch_bounds__(
-   THREADS5, 1) void RepulsionKernel( /* int *restrict errd, */
+   THREADS5, 1) void RepulsionKernel(/* int *restrict errd, */
    const float theta,
    const float epssqd, // correction for zero distance
    const value_idx* restrict sortd,

4 changes: 2 additions & 2 deletions cpp/src/tsne/cannylab/bh.cu

@@ -47,7 +47,7 @@ Emerald Edition, pp. 75-92. January 2011.
  // threads per block
  #define THREADS1 1024 /* must be a power of 2 */
  #define THREADS2 1024
- #define THREADS3 768 /* shared-memory limited on some devices */
+ #define THREADS3 768  /* shared-memory limited on some devices */
  #define THREADS4 1024
  #define THREADS5 1024
  #define THREADS6 1024

@@ -562,7 +562,7 @@ __global__ __launch_bounds__(THREADS4,
        }
      }
    }
-   k -= dec; // move on to next cell
+   k -= dec;  // move on to next cell
  }
  __syncthreads(); // optional barrier for performance
 }

4 changes: 2 additions & 2 deletions cpp/src/tsne/exact_kernels.cuh

@@ -290,8 +290,8 @@ __global__ void repulsive_kernel(const value_t* restrict Y,
    value_t* restrict Z_sum2,
    const value_idx n,
    const value_idx dim,
-   const value_t df_power, // -(df + 1)/2)
-   const value_t recp_df) // 1 / df
+   const value_t df_power,  // -(df + 1)/2)
+   const value_t recp_df)   // 1 / df
  {
    const auto j = (blockIdx.x * blockDim.x) + threadIdx.x; // for every item in row
    const auto i = (blockIdx.y * blockDim.y) + threadIdx.y; // for every row

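The two reformatted parameters encode the heavy-tailed Student-t kernel used for repulsion: with squared distance d², the unnormalized affinity is (1 + d²/df)^(-(df+1)/2). A sketch of how such precomputed constants are used (hypothetical helper, not the actual kernel body):

```cpp
#include <cmath>

// Unnormalized Student-t affinity between two embedded points, written with
// the same precomputed constants as the kernel parameters above:
//   df_power = -(df + 1) / 2,  recp_df = 1 / df
float t_affinity(float dist_sq, float df_power, float recp_df)
{
  return std::pow(1.0f + dist_sq * recp_df, df_power);
}

// With df = 1 (df_power = -1, recp_df = 1) this recovers classic t-SNE's
// 1 / (1 + d^2) similarity.
```
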
2 changes: 1 addition & 1 deletion cpp/src/umap/fuzzy_simpl_set/naive.cuh

@@ -194,7 +194,7 @@ __global__ void compute_membership_strength_kernel(
    const value_idx* knn_indices,
    const float* knn_dists, // nn outputs
    const value_t* sigmas,
-   const value_t* rhos, // continuous dists to nearest neighbors
+   const value_t* rhos,  // continuous dists to nearest neighbors
    value_t* vals,
    int* rows,
    int* cols, // result coo

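For context, this kernel's output follows UMAP's smooth-kNN construction: each edge distance is shifted by the point's nearest-neighbor distance `rho` and scaled by the bandwidth `sigma`. A sketch assuming the standard UMAP formula, not copied from the kernel body:

```cpp
#include <cmath>

// Standard UMAP membership strength for an edge with distance `d`, given the
// source point's nearest-neighbor distance `rho` and its bandwidth `sigma`.
float membership_strength(float d, float rho, float sigma)
{
  if (d <= rho) return 1.0f;            // nearest neighbors get full weight
  return std::exp(-(d - rho) / sigma);  // smooth exponential falloff beyond rho
}
```
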
2 changes: 1 addition & 1 deletion cpp/src_prims/timeSeries/arima_helpers.cuh

@@ -158,7 +158,7 @@ __global__ void _undiff_kernel(DataT* d_fc,
  for (int i = 0; i < num_steps; i++) {
    if (!double_diff) { // One simple or seasonal difference
      b_fc[i] += _select_read(b_in, n_in, b_fc, i - s0);
-   } else { // Two differences (simple, seasonal or both)
+   } else {  // Two differences (simple, seasonal or both)
      DataT fc_acc = -_select_read(b_in, n_in, b_fc, i - s0 - s1);
      fc_acc += _select_read(b_in, n_in, b_fc, i - s0);
      fc_acc += _select_read(b_in, n_in, b_fc, i - s1);

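The `else` branch being reformatted inverts two nested differences: each forecast is rebuilt from the three previously known values at lags s0, s1, and s0 + s1. A simplified sketch of the recurrence, operating on one contiguous array rather than cuML's split buffers:

```cpp
#include <cstddef>
#include <vector>

// Undo two nested differences (e.g. one simple, one seasonal) on forecasts.
// `series` holds the observed history followed by the differenced forecasts,
// which are rebuilt in place; forecasts start at index n_obs. Assumes
// n_obs >= s0 + s1 so every lagged read stays in bounds.
void undifference_twice(std::vector<double>& series, int n_obs, int s0, int s1)
{
  for (std::size_t i = n_obs; i < series.size(); ++i) {
    // y[i] = d[i] + y[i - s0] + y[i - s1] - y[i - s0 - s1]
    series[i] += series[i - s0] + series[i - s1] - series[i - s0 - s1];
  }
}
```
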
2 changes: 1 addition & 1 deletion cpp/test/prims/batched/csr.cu

@@ -38,7 +38,7 @@ template <typename T>
  struct CSRInputs {
  CSROperation operation;
  int batch_size;
- int m; // Dimensions of A
+ int m;   // Dimensions of A
  int n;
  int nnz; // Number of non-zero elements in A
  int p; // Dimensions of B or x

4 changes: 2 additions & 2 deletions cpp/test/sg/fil_child_index_test.cu

@@ -206,7 +206,7 @@ std::vector<ChildIndexTestParams> params = {
  CHILD_INDEX_TEST_PARAMS(node = NODE(def_left = true), input = QNAN, correct = 1), // !def_left
  CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = QNAN), input = QNAN, correct = 2), // !def_left
  CHILD_INDEX_TEST_PARAMS(
-   node = NODE(def_left = true, thresh = QNAN), input = QNAN, correct = 1), // !def_left
+   node = NODE(def_left = true, thresh = QNAN), input = QNAN, correct = 1),  // !def_left
  CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = QNAN), input = 0.0, correct = 1), // val !>= thresh
  CHILD_INDEX_TEST_PARAMS(
    node = NODE(thresh = 0.0), parent_node_idx = 1, input = -INF, correct = 3),

@@ -224,7 +224,7 @@
    node = NODE(thresh = 0.0), parent_node_idx = 4, input = -INF, correct = 9),
  CHILD_INDEX_TEST_PARAMS(
    node = NODE(thresh = 0.0), parent_node_idx = 4, input = 0.0, correct = 10),
- CHILD_INDEX_TEST_PARAMS(parent_node_idx = 4, input = QNAN, correct = 10), // !def_left
+ CHILD_INDEX_TEST_PARAMS(parent_node_idx = 4, input = QNAN, correct = 10),  // !def_left
  CHILD_INDEX_TEST_PARAMS(
    node = NODE(def_left = true), input = QNAN, parent_node_idx = 4, correct = 9), // !def_left
  // cannot match ( < 0 and realistic fid_num_cats)

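These cases pin down NaN routing in a dense tree: a missing (NaN) input follows the node's `def_left` default, and any comparison against a NaN threshold is false, so the row goes left. A sketch of the indexing the expectations encode, assuming children of node i sit at 2i+1 and 2i+2 (hypothetical helper, not the test's actual implementation):

```cpp
#include <cmath>

// Child index in a dense binary tree: children of node i are at
// 2*i + 1 (left) and 2*i + 2 (right).
int child_index(int node_idx, float input, float thresh, bool def_left)
{
  bool go_right;
  if (std::isnan(input)) {
    go_right = !def_left;        // missing values follow the default direction
  } else {
    go_right = input >= thresh;  // false when thresh is NaN -> go left
  }
  return 2 * node_idx + 1 + static_cast<int>(go_right);
}

// Matches the expectations above, e.g.:
//   child_index(0, NAN, 0.0f, /*def_left=*/true)  == 1
//   child_index(0, NAN, NAN,  /*def_left=*/false) == 2
//   child_index(4, NAN, 0.0f, /*def_left=*/false) == 10
```
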
4 changes: 2 additions & 2 deletions dependencies.yaml

@@ -95,8 +95,8 @@ dependencies:
    common:
      - output_types: [conda, requirements]
        packages:
-         - clang==15.0.7
-         - clang-tools==15.0.7
+         - clang==16.0.6
+         - clang-tools==16.0.6
          - ninja
          - tomli
    common_build: