From 77c294b5fde171e83025d5fa97cff3fcbd5aff26 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 13 Apr 2023 11:28:47 -0500 Subject: [PATCH 1/6] Update clang-format version. --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8a5fb46d3d..5c5b291dad 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -37,7 +37,7 @@ repos: name: clang-format entry: python ./cpp/scripts/run-clang-format.py language: python - additional_dependencies: [clang-format==11.1.0] + additional_dependencies: [clang-format==16.0.1] - id: copyright-check name: copyright-check entry: python ./ci/checks/copyright.py --fix-in-place From 7e96fe9b71d1a7744cb9f1daa9daed3c0ffd35bc Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 13 Apr 2023 11:50:46 -0500 Subject: [PATCH 2/6] Update specific references in cuml docs and scripts. --- BUILD.md | 2 +- cpp/README.md | 2 +- cpp/scripts/run-clang-format.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/BUILD.md b/BUILD.md index 9f1044c168..ed04f300ac 100644 --- a/BUILD.md +++ b/BUILD.md @@ -11,7 +11,7 @@ To install cuML from source, ensure the following dependencies are met: 5. Cython (>= 0.29) 6. gcc (>= 9.0) 7. BLAS - Any BLAS compatible with cmake's [FindBLAS](https://cmake.org/cmake/help/v3.14/module/FindBLAS.html). Note that the blas has to be installed to the same folder system as cmake, for example if using conda installed cmake, the blas implementation should also be installed in the conda environment. -8. clang-format (= 11.1.0) - enforces uniform C++ coding style; required to build cuML from source. The packages `clang=8` and `clang-tools=8` from the conda-forge channel should be sufficient, if you are on conda. If not using conda, install the right version using your OS package manager. +8. clang-format (= 16.0.1) - enforces uniform C++ coding style; required to build cuML from source. The packages `clang=16` and `clang-tools=16` from the conda-forge channel should be sufficient, if you are on conda. If not using conda, install the right version using your OS package manager. 9. NCCL (>=2.4) 10. UCX [optional] (>= 1.7) - enables point-to-point messaging in the cuML standard communicator. This is necessary for many multi-node multi-GPU cuML algorithms to function. diff --git a/cpp/README.md b/cpp/README.md index 68ad105e61..5ae444ccc7 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -18,7 +18,7 @@ The `test` directory has subdirectories that reflect this distinction between th 1. cmake (>= 3.23.1) 2. CUDA (>= 11.0) 3. gcc (>=9.3.0) -4. clang-format (= 11.1.0) - enforces uniform C++ coding style; required to build cuML from source. The packages `clang=11` and `clang-tools=11` from the conda-forge channel should be sufficient, if you are on conda. If not using conda, install the right version using your OS package manager. +4. clang-format (= 16.0.1) - enforces uniform C++ coding style; required to build cuML from source. The packages `clang=16` and `clang-tools=16` from the conda-forge channel should be sufficient, if you are on conda. If not using conda, install the right version using your OS package manager. 
### Building cuML: diff --git a/cpp/scripts/run-clang-format.py b/cpp/scripts/run-clang-format.py index dc933056c8..b319e32e26 100755 --- a/cpp/scripts/run-clang-format.py +++ b/cpp/scripts/run-clang-format.py @@ -23,7 +23,7 @@ import shutil -EXPECTED_VERSION = "11.1.0" +EXPECTED_VERSION = "16.0.1" VERSION_REGEX = re.compile(r"clang-format version ([0-9.]+)") # NOTE: populate this list with more top-level dirs as we add more of them to # to the cuml repo From 5de55920c7e6cc5de8d635f293a89ee62fb2cab0 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 14 Apr 2023 14:39:55 -0500 Subject: [PATCH 3/6] Use pre-commit hook instead of local script. --- .pre-commit-config.yaml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5c5b291dad..2c334e69cc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,6 +18,12 @@ repos: types_or: [python, cython] exclude: thirdparty additional_dependencies: [flake8-force] + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v16.0.1 + hooks: + - id: clang-format + types_or: [c, c++, cuda] + args: ["-fallback-style=none", "-style=file", "-i"] - repo: https://github.com/codespell-project/codespell rev: v2.2.2 hooks: @@ -33,11 +39,6 @@ repos: entry: '(category=|\s)DeprecationWarning[,)]' language: pygrep types_or: [python, cython] - - id: clang-format - name: clang-format - entry: python ./cpp/scripts/run-clang-format.py - language: python - additional_dependencies: [clang-format==16.0.1] - id: copyright-check name: copyright-check entry: python ./ci/checks/copyright.py --fix-in-place From d82364793f6e89f8781786a6eec8f9981337d208 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 14 Apr 2023 14:41:18 -0500 Subject: [PATCH 4/6] Apply clang-format. 
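For anyone reproducing this re-format locally, an equivalent result can normally be obtained through the clang-format hook added in the previous commit (this assumes pre-commit is installed and can fetch the pinned clang-format 16.0.1):

    # one-time setup: install the repository's git hooks
    pre-commit install
    # run only the clang-format hook over every tracked file
    pre-commit run clang-format --all-files
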
--- cpp/bench/sg/fil.cu | 34 +- cpp/bench/sg/filex.cu | 170 ++--- cpp/bench/sg/rf_classifier.cu | 15 +- cpp/bench/sg/rf_regressor.cu | 15 +- cpp/bench/sg/svr.cu | 4 +- .../cuml/common/pinned_host_vector.hpp | 8 +- .../cuml/experimental/fil/constants.hpp | 36 +- .../cuml/experimental/fil/decision_forest.hpp | 404 +++++------ .../cuml/experimental/fil/detail/bitset.hpp | 78 +-- .../fil/detail/cpu_introspection.hpp | 10 +- .../fil/detail/decision_forest_builder.hpp | 209 +++--- .../fil/detail/device_initialization.hpp | 28 +- .../fil/detail/device_initialization/cpu.hpp | 21 +- .../fil/detail/device_initialization/gpu.cuh | 499 ++++++-------- .../fil/detail/device_initialization/gpu.hpp | 19 +- .../experimental/fil/detail/evaluate_tree.hpp | 56 +- .../cuml/experimental/fil/detail/forest.hpp | 51 +- .../fil/detail/gpu_introspection.hpp | 82 +-- .../experimental/fil/detail/index_type.hpp | 4 +- .../cuml/experimental/fil/detail/infer.hpp | 217 +++--- .../experimental/fil/detail/infer/cpu.hpp | 65 +- .../experimental/fil/detail/infer/gpu.cuh | 389 +++++------ .../experimental/fil/detail/infer/gpu.hpp | 41 +- .../fil/detail/infer_kernel/cpu.hpp | 171 ++--- .../fil/detail/infer_kernel/gpu.cuh | 214 ++---- .../infer_kernel/shared_memory_buffer.cuh | 55 +- .../cuml/experimental/fil/detail/node.hpp | 180 +++-- .../experimental/fil/detail/postprocessor.hpp | 366 +++++----- .../fil/detail/raft_proto/buffer.hpp | 419 ++++++------ .../fil/detail/raft_proto/ceildiv.hpp | 5 +- .../fil/detail/raft_proto/cuda_check.hpp | 5 +- .../fil/detail/raft_proto/cuda_stream.hpp | 5 +- .../raft_proto/detail/const_agnostic.hpp | 2 +- .../fil/detail/raft_proto/detail/copy.hpp | 64 +- .../fil/detail/raft_proto/detail/copy/cpu.hpp | 24 +- .../fil/detail/raft_proto/detail/copy/gpu.hpp | 16 +- .../raft_proto/detail/cuda_check/base.hpp | 7 +- .../raft_proto/detail/cuda_check/gpu.hpp | 7 +- .../raft_proto/detail/device_id/base.hpp | 6 +- .../raft_proto/detail/device_id/cpu.hpp | 5 +- .../raft_proto/detail/device_id/gpu.hpp | 18 +- .../raft_proto/detail/device_setter/base.hpp | 6 +- .../raft_proto/detail/device_setter/gpu.hpp | 23 +- .../raft_proto/detail/host_only_throw.hpp | 2 +- .../detail/host_only_throw/base.hpp | 9 +- .../raft_proto/detail/host_only_throw/cpu.hpp | 11 +- .../raft_proto/detail/non_owning_buffer.hpp | 4 +- .../detail/non_owning_buffer/base.hpp | 11 +- .../raft_proto/detail/owning_buffer.hpp | 4 +- .../raft_proto/detail/owning_buffer/base.hpp | 6 +- .../raft_proto/detail/owning_buffer/cpu.hpp | 19 +- .../raft_proto/detail/owning_buffer/gpu.hpp | 20 +- .../fil/detail/raft_proto/device_id.hpp | 2 +- .../fil/detail/raft_proto/device_type.hpp | 5 +- .../fil/detail/raft_proto/exceptions.hpp | 16 +- .../fil/detail/raft_proto/gpu_support.hpp | 10 +- .../fil/detail/raft_proto/handle.hpp | 35 +- .../fil/detail/raft_proto/padding.hpp | 20 +- .../fil/detail/specialization_types.hpp | 45 +- .../device_initialization_macros.hpp | 8 +- .../detail/specializations/forest_macros.hpp | 14 +- .../detail/specializations/infer_macros.hpp | 163 ++--- .../cuml/experimental/fil/exceptions.hpp | 19 +- .../cuml/experimental/fil/forest_model.hpp | 310 ++++----- .../cuml/experimental/fil/infer_kind.hpp | 10 +- .../cuml/experimental/fil/postproc_ops.hpp | 24 +- .../cuml/experimental/fil/tree_layout.hpp | 9 +- .../experimental/fil/treelite_importer.hpp | 564 +++++++--------- cpp/include/cuml/genetic/node.h | 4 +- cpp/include/cuml/metrics/metrics.hpp | 6 +- cpp/src/arima/batched_kalman.cu | 14 +- .../batched-levelalgo/builder.cuh | 2 +- 
cpp/src/experimental/fil/infer0.cpp | 8 +- cpp/src/experimental/fil/infer0.cu | 10 +- cpp/src/experimental/fil/infer1.cpp | 8 +- cpp/src/experimental/fil/infer1.cu | 10 +- cpp/src/experimental/fil/infer2.cpp | 8 +- cpp/src/experimental/fil/infer2.cu | 10 +- cpp/src/experimental/fil/infer3.cpp | 8 +- cpp/src/experimental/fil/infer3.cu | 10 +- cpp/src/experimental/fil/infer4.cpp | 8 +- cpp/src/experimental/fil/infer4.cu | 10 +- cpp/src/experimental/fil/infer5.cpp | 8 +- cpp/src/experimental/fil/infer5.cu | 10 +- cpp/src/experimental/fil/infer6.cpp | 8 +- cpp/src/experimental/fil/infer6.cu | 10 +- cpp/src/experimental/fil/infer7.cpp | 8 +- cpp/src/experimental/fil/infer7.cu | 10 +- cpp/src/explainer/tree_shap.cu | 2 +- cpp/src/fil/fil.cu | 14 +- cpp/src/fil/internal.cuh | 3 +- cpp/src/fil/treelite_import.cu | 30 +- cpp/src/genetic/fitness.cuh | 14 +- cpp/src/genetic/genetic.cu | 4 +- cpp/src/glm/qn/glm_base.cuh | 2 +- cpp/src/glm/qn/qn_util.cuh | 22 +- cpp/src/hdbscan/detail/soft_clustering.cuh | 281 ++++---- cpp/src/hdbscan/detail/utils.h | 30 +- cpp/src/hierarchy/linkage.cu | 8 +- cpp/src/knn/knn_opg_common.cuh | 6 +- cpp/src/solver/sgd.cuh | 4 +- cpp/src/svm/kernelcache.cuh | 8 +- cpp/src/svm/results.cuh | 14 +- cpp/src/svm/smosolver.cuh | 4 +- cpp/src/tsne/barnes_hut_kernels.cuh | 2 +- cpp/src/tsne/cannylab/bh.cu | 629 +++++++++++------- cpp/src/tsne/exact_kernels.cuh | 4 +- cpp/src/umap/fuzzy_simpl_set/naive.cuh | 2 +- cpp/src_prims/timeSeries/arima_helpers.cuh | 4 +- cpp/test/c_api/dbscan_api_test.c | 12 +- cpp/test/c_api/glm_api_test.c | 39 +- cpp/test/c_api/holtwinters_api_test.c | 20 +- cpp/test/c_api/knn_api_test.c | 11 +- cpp/test/c_api/svm_api_test.c | 60 +- cpp/test/prims/batched/csr.cu | 4 +- cpp/test/sg/fil_child_index_test.cu | 6 +- cpp/test/sg/lars_test.cu | 18 +- cpp/test/sg/rf_test.cu | 10 +- cpp/test/sg/tsne_test.cu | 2 +- python/cuml/internals/callbacks_implems.h | 90 ++- 120 files changed, 3303 insertions(+), 3619 deletions(-) diff --git a/cpp/bench/sg/fil.cu b/cpp/bench/sg/fil.cu index 67017fd9f5..09efc1dfa3 100644 --- a/cpp/bench/sg/fil.cu +++ b/cpp/bench/sg/fil.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,16 +43,16 @@ class FIL : public RegressionFixture { public: FIL(const std::string& name, const Params& p) - /* - fitting to linear combinations in "y" normally yields trees that check - values of all significant columns, as well as their linear - combinations in "X". During inference, the exact threshold - values do not affect speed. The distribution of column popularity does - not affect speed barring lots of uninformative columns in succession. - Hence, this method represents real datasets well enough for both - classification and regression. - */ - : RegressionFixture(name, p.data, p.blobs), model(p.model), p_rest(p) + /* + fitting to linear combinations in "y" normally yields trees that check + values of all significant columns, as well as their linear + combinations in "X". During inference, the exact threshold + values do not affect speed. The distribution of column popularity does + not affect speed barring lots of uninformative columns in succession. + Hence, this method represents real datasets well enough for both + classification and regression. 
+ */ + : RegressionFixture(name, p.data, p.blobs), model(p.model), p_rest(p) { } @@ -140,12 +140,12 @@ std::vector getInputs() Params p; p.data.rowMajor = true; p.blobs = {.n_informative = -1, // Just a placeholder value, anyway changed below - .effective_rank = -1, // Just a placeholder value, anyway changed below - .bias = 0.f, - .tail_strength = 0.1, - .noise = 0.01, - .shuffle = false, - .seed = 12345ULL}; + .effective_rank = -1, // Just a placeholder value, anyway changed below + .bias = 0.f, + .tail_strength = 0.1, + .noise = 0.01, + .shuffle = false, + .seed = 12345ULL}; p.rf = set_rf_params(10, /*max_depth */ (1 << 20), /* max_leaves */ diff --git a/cpp/bench/sg/filex.cu b/cpp/bench/sg/filex.cu index 781e3debfd..048d89c3d9 100644 --- a/cpp/bench/sg/filex.cu +++ b/cpp/bench/sg/filex.cu @@ -14,11 +14,11 @@ * limitations under the License. */ -#include -#include -#include #include +#include #include +#include +#include #include "benchmark.cuh" #include @@ -49,7 +49,7 @@ class FILEX : public RegressionFixture { public: FILEX(const std::string& name, const Params& p) - : RegressionFixture(name, p.data, p.blobs), model(p.model), p_rest(p) + : RegressionFixture(name, p.data, p.blobs), model(p.model), p_rest(p) { } @@ -59,7 +59,7 @@ class FILEX : public RegressionFixture { if (!params.rowMajor) { state.SkipWithError("FIL only supports row-major inputs"); } // create model ML::RandomForestRegressorF rf_model; - auto* mPtr = &rf_model; + auto* mPtr = &rf_model; auto train_nrows = std::min(params.nrows, 1000); fit(*handle, mPtr, data.X.data(), train_nrows, params.ncols, data.y.data(), p_rest.rf); handle->sync_stream(stream); @@ -73,8 +73,7 @@ class FILEX : public RegressionFixture { false, raft_proto::device_type::gpu, 0, - stream - ); + stream); ML::fil::treelite_params_t tl_params = { .algo = ML::fil::algo_t::NAIVE, @@ -86,10 +85,10 @@ class FILEX : public RegressionFixture { .n_items = 0, .pforest_shape_str = nullptr}; ML::fil::forest_variant forest_variant; - auto optimal_chunk_size = 1; - auto optimal_storage_type = p_rest.storage; - auto optimal_algo_type = ML::fil::algo_t::NAIVE; - auto optimal_layout = ML::experimental::fil::tree_layout::breadth_first; + auto optimal_chunk_size = 1; + auto optimal_storage_type = p_rest.storage; + auto optimal_algo_type = ML::fil::algo_t::NAIVE; + auto optimal_layout = ML::experimental::fil::tree_layout::breadth_first; auto allowed_storage_types = std::vector{}; if (p_rest.storage == ML::fil::storage_type_t::DENSE) { allowed_storage_types.push_back(ML::fil::storage_type_t::DENSE); @@ -119,14 +118,7 @@ class FILEX : public RegressionFixture { tl_params.algo = algo_type; for (auto layout : allowed_layouts) { filex_model = ML::experimental::fil::import_from_treelite_handle( - model, - layout, - 128, - false, - raft_proto::device_type::gpu, - 0, - stream - ); + model, layout, 128, false, raft_proto::device_type::gpu, 0, stream); for (auto chunk_size = 1; chunk_size <= 32; chunk_size *= 2) { if (!p_rest.use_experimental) { tl_params.threads_per_tree = chunk_size; @@ -139,104 +131,83 @@ class FILEX : public RegressionFixture { for (int i = 0; i < p_rest.predict_repetitions; i++) { // Create FIL forest if (p_rest.use_experimental) { - filex_model.predict( - *handle, - data.y.data(), - data.X.data(), - params.nrows, - raft_proto::device_type::gpu, - raft_proto::device_type::gpu, - ML::experimental::fil::infer_kind::default_kind, - chunk_size - ); + filex_model.predict(*handle, + data.y.data(), + data.X.data(), + params.nrows, + raft_proto::device_type::gpu, + 
raft_proto::device_type::gpu, + ML::experimental::fil::infer_kind::default_kind, + chunk_size); } else { - ML::fil::predict(*handle, - forest, - data.y.data(), - data.X.data(), - params.nrows, - false); + ML::fil::predict( + *handle, forest, data.y.data(), data.X.data(), params.nrows, false); } } handle->sync_stream(); handle->sync_stream_pool(); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed = std::chrono::duration_cast( - end - start - ).count(); + auto elapsed = + std::chrono::duration_cast(end - start).count(); if (elapsed < min_time) { - min_time = elapsed; - optimal_chunk_size = chunk_size; + min_time = elapsed; + optimal_chunk_size = chunk_size; optimal_storage_type = storage_type; - optimal_algo_type = algo_type; - optimal_layout = layout; + optimal_algo_type = algo_type; + optimal_layout = layout; } // Clean up from FIL - if (!p_rest.use_experimental) { - ML::fil::free(*handle, forest); - } - } - if (!p_rest.use_experimental) { - break; + if (!p_rest.use_experimental) { ML::fil::free(*handle, forest); } } + if (!p_rest.use_experimental) { break; } } - if (p_rest.use_experimental) { - break; - } - } - if (p_rest.use_experimental) { - break; + if (p_rest.use_experimental) { break; } } + if (p_rest.use_experimental) { break; } } // Build optimal FIL tree - tl_params.storage_type = optimal_storage_type; - tl_params.algo = optimal_algo_type; + tl_params.storage_type = optimal_storage_type; + tl_params.algo = optimal_algo_type; tl_params.threads_per_tree = optimal_chunk_size; ML::fil::from_treelite(*handle, &forest_variant, model, &tl_params); - forest = std::get>(forest_variant); + forest = std::get>(forest_variant); filex_model = ML::experimental::fil::import_from_treelite_handle( - model, - optimal_layout, - 128, - false, - raft_proto::device_type::gpu, - 0, - stream - ); + model, optimal_layout, 128, false, raft_proto::device_type::gpu, 0, stream); handle->sync_stream(); handle->sync_stream_pool(); // only time prediction - this->loopOnState(state, [this, &filex_model, optimal_chunk_size]() { - for (int i = 0; i < p_rest.predict_repetitions; i++) { - if (p_rest.use_experimental) { - filex_model.predict( - *handle, - this->data.y.data(), - this->data.X.data(), - this->params.nrows, - raft_proto::device_type::gpu, - raft_proto::device_type::gpu, - ML::experimental::fil::infer_kind::default_kind, - optimal_chunk_size - ); - handle->sync_stream(); - handle->sync_stream_pool(); - } else { - ML::fil::predict(*this->handle, - this->forest, - this->data.y.data(), - this->data.X.data(), - this->params.nrows, - false); - handle->sync_stream(); - handle->sync_stream_pool(); + this->loopOnState( + state, + [this, &filex_model, optimal_chunk_size]() { + for (int i = 0; i < p_rest.predict_repetitions; i++) { + if (p_rest.use_experimental) { + filex_model.predict(*handle, + this->data.y.data(), + this->data.X.data(), + this->params.nrows, + raft_proto::device_type::gpu, + raft_proto::device_type::gpu, + ML::experimental::fil::infer_kind::default_kind, + optimal_chunk_size); + handle->sync_stream(); + handle->sync_stream_pool(); + } else { + ML::fil::predict(*this->handle, + this->forest, + this->data.y.data(), + this->data.X.data(), + this->params.nrows, + false); + handle->sync_stream(); + handle->sync_stream_pool(); + } } - } - }, true); + }, + true); } void allocateBuffers(const ::benchmark::State& state) override { Base::allocateBuffers(state); } @@ -269,12 +240,12 @@ std::vector getInputs() Params p; p.data.rowMajor = true; p.blobs = {.n_informative = -1, // Just a 
placeholder value, anyway changed below - .effective_rank = -1, // Just a placeholder value, anyway changed below - .bias = 0.f, - .tail_strength = 0.1, - .noise = 0.01, - .shuffle = false, - .seed = 12345ULL}; + .effective_rank = -1, // Just a placeholder value, anyway changed below + .bias = 0.f, + .tail_strength = 0.1, + .noise = 0.01, + .shuffle = false, + .seed = 12345ULL}; p.rf = set_rf_params(10, /*max_depth */ (1 << 20), /* max_leaves */ @@ -306,8 +277,7 @@ std::vector getInputs() {(int)1e6, 20, 1, 10, 10000, storage_type_t::DENSE, false}, {(int)1e6, 20, 1, 10, 10000, storage_type_t::DENSE, true}, {(int)1e6, 200, 1, 10, 1000, storage_type_t::DENSE, false}, - {(int)1e6, 200, 1, 10, 1000, storage_type_t::DENSE, true} - }; + {(int)1e6, 200, 1, 10, 1000, storage_type_t::DENSE, true}}; for (auto& i : var_params) { p.data.nrows = i.nrows; p.data.ncols = i.ncols; @@ -326,6 +296,6 @@ std::vector getInputs() ML_BENCH_REGISTER(Params, FILEX, "", getInputs()); -} // end namespace fil +} // namespace filex } // end namespace Bench } // end namespace ML diff --git a/cpp/bench/sg/rf_classifier.cu b/cpp/bench/sg/rf_classifier.cu index 0141bb2798..9f3a9b1a7d 100644 --- a/cpp/bench/sg/rf_classifier.cu +++ b/cpp/bench/sg/rf_classifier.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,8 +30,7 @@ struct Params { }; template -struct RFClassifierModel { -}; +struct RFClassifierModel {}; template <> struct RFClassifierModel { @@ -86,11 +85,11 @@ std::vector getInputs() std::vector out; Params p; p.data.rowMajor = false; - p.blobs = {10.0, // cluster_std - false, // shuffle - -10.0, // center_box_min - 10.0, // center_box_max - 2152953ULL}; // seed + p.blobs = {10.0, // cluster_std + false, // shuffle + -10.0, // center_box_min + 10.0, // center_box_max + 2152953ULL}; // seed p.rf = set_rf_params(10, /*max_depth */ (1 << 20), /* max_leaves */ diff --git a/cpp/bench/sg/rf_regressor.cu b/cpp/bench/sg/rf_regressor.cu index 2985d7fcf6..20592b914a 100644 --- a/cpp/bench/sg/rf_regressor.cu +++ b/cpp/bench/sg/rf_regressor.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -30,8 +30,7 @@ struct RegParams { }; template -struct RFRegressorModel { -}; +struct RFRegressorModel {}; template <> struct RFRegressorModel { @@ -86,11 +85,11 @@ std::vector getInputs() RegParams p; p.data.rowMajor = false; p.regression = {.shuffle = true, // Better to shuffle when n_informative < ncols - .effective_rank = -1, // dataset generation will be faster - .bias = 4.5, - .tail_strength = 0.5, // unused when effective_rank = -1 - .noise = 1.0, - .seed = 12345ULL}; + .effective_rank = -1, // dataset generation will be faster + .bias = 4.5, + .tail_strength = 0.5, // unused when effective_rank = -1 + .noise = 1.0, + .seed = 12345ULL}; p.rf = set_rf_params(10, /*max_depth */ (1 << 20), /* max_leaves */ diff --git a/cpp/bench/sg/svr.cu b/cpp/bench/sg/svr.cu index 22185d40bc..b03c38b15e 100644 --- a/cpp/bench/sg/svr.cu +++ b/cpp/bench/sg/svr.cu @@ -95,9 +95,9 @@ std::vector> getInputs() p.regression.shuffle = true; // better to shuffle when n_informative < ncols p.regression.seed = 1378ULL; - p.regression.effective_rank = -1; // dataset generation will be faster + p.regression.effective_rank = -1; // dataset generation will be faster p.regression.bias = 0; - p.regression.tail_strength = 0.5; // unused when effective_rank = -1 + p.regression.tail_strength = 0.5; // unused when effective_rank = -1 p.regression.noise = 1; // SvmParameter{C, cache_size, max_iter, nochange_steps, tol, verbosity, diff --git a/cpp/include/cuml/common/pinned_host_vector.hpp b/cpp/include/cuml/common/pinned_host_vector.hpp index b29527a893..768bcb3b4e 100644 --- a/cpp/include/cuml/common/pinned_host_vector.hpp +++ b/cpp/include/cuml/common/pinned_host_vector.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,10 +32,10 @@ class pinned_host_vector { } ~pinned_host_vector() { pinned_mr.deallocate(data_, size_ * sizeof(T)); } - pinned_host_vector(pinned_host_vector const&) = delete; - pinned_host_vector(pinned_host_vector&&) = delete; + pinned_host_vector(pinned_host_vector const&) = delete; + pinned_host_vector(pinned_host_vector&&) = delete; pinned_host_vector& operator=(pinned_host_vector const&) = delete; - pinned_host_vector& operator=(pinned_host_vector&&) = delete; + pinned_host_vector& operator=(pinned_host_vector&&) = delete; void resize(std::size_t n) { diff --git a/cpp/include/cuml/experimental/fil/constants.hpp b/cpp/include/cuml/experimental/fil/constants.hpp index 73b7ea6840..3859560aea 100644 --- a/cpp/include/cuml/experimental/fil/constants.hpp +++ b/cpp/include/cuml/experimental/fil/constants.hpp @@ -19,22 +19,22 @@ namespace ML { namespace experimental { namespace fil { - /** - * The default memory layout for FIL trees if not otherwise specified - */ - auto constexpr static const preferred_tree_layout = tree_layout::breadth_first; - /** - * The number of bits used for flags in node metadata - * - * Each node in a FIL tree must specify the feature used for its split in - * addition to some other basic information. The feature ID is "packed" - * with a few flags in order to reduce the size of the node. This constant - * indicates how many leading bits are reserved for flags to allow import - * functions to assess how much space is required for the whole metadata - * field. 
- */ - auto constexpr static const reserved_node_metadata_bits = 3; +/** + * The default memory layout for FIL trees if not otherwise specified + */ +auto constexpr static const preferred_tree_layout = tree_layout::breadth_first; +/** + * The number of bits used for flags in node metadata + * + * Each node in a FIL tree must specify the feature used for its split in + * addition to some other basic information. The feature ID is "packed" + * with a few flags in order to reduce the size of the node. This constant + * indicates how many leading bits are reserved for flags to allow import + * functions to assess how much space is required for the whole metadata + * field. + */ +auto constexpr static const reserved_node_metadata_bits = 3; -} -} -} +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/decision_forest.hpp b/cpp/include/cuml/experimental/fil/decision_forest.hpp index 7b86c07b86..73b314c94e 100644 --- a/cpp/include/cuml/experimental/fil/decision_forest.hpp +++ b/cpp/include/cuml/experimental/fil/decision_forest.hpp @@ -14,26 +14,26 @@ * limitations under the License. */ #pragma once -#include -#include #include #include #include -#include -#include #include +#include #include #include #include -#include -#include -#include #include #include #include +#include +#include +#include +#include #include #include #include +#include +#include #include namespace ML { @@ -63,9 +63,12 @@ namespace fil { * its most distant child. This type must be large enough to store the * largest such offset in the entire forest. */ -template +template struct decision_forest { - /** * The in-memory layout of nodes in this forest */ @@ -74,13 +77,7 @@ struct decision_forest { * The type of the forest object which is actually passed to the CPU/GPU * for inference */ - using forest_type = forest< - layout, - threshold_t, - index_t, - metadata_storage_t, - offset_t - >; + using forest_type = forest; /** * The type of nodes within the forest */ @@ -105,20 +102,22 @@ struct decision_forest { /** * Construct an empty decision forest */ - decision_forest() : - nodes_{}, - root_node_indexes_{}, - vector_output_{}, - categorical_storage_{}, - num_features_{}, - num_outputs_{}, - leaf_size_{}, - has_categorical_nodes_{false}, - row_postproc_{}, - elem_postproc_{}, - average_factor_{}, - bias_{}, - postproc_constant_{} {} + decision_forest() + : nodes_{}, + root_node_indexes_{}, + vector_output_{}, + categorical_storage_{}, + num_features_{}, + num_outputs_{}, + leaf_size_{}, + has_categorical_nodes_{false}, + row_postproc_{}, + elem_postproc_{}, + average_factor_{}, + bias_{}, + postproc_constant_{} + { + } /** * Construct a decision forest with the indicated data @@ -154,44 +153,41 @@ struct decision_forest { * operations, including sigmoid, exponential, and * logarithm_one_plus_exp */ - decision_forest( - raft_proto::buffer&& nodes, - raft_proto::buffer&& root_node_indexes, - index_type num_features, - index_type num_outputs=index_type{2}, - bool has_categorical_nodes = false, - std::optional>&& vector_output=std::nullopt, - std::optional>&& categorical_storage=std::nullopt, - index_type leaf_size=index_type{1}, - row_op row_postproc=row_op::disable, - element_op elem_postproc=element_op::disable, - io_type average_factor=io_type{1}, - io_type bias=io_type{0}, - io_type postproc_constant=io_type{1} - ) : - nodes_{nodes}, - root_node_indexes_{root_node_indexes}, - vector_output_{vector_output}, - categorical_storage_{categorical_storage}, - 
num_features_{num_features}, - num_outputs_{num_outputs}, - leaf_size_{leaf_size}, - has_categorical_nodes_{has_categorical_nodes}, - row_postproc_{row_postproc}, - elem_postproc_{elem_postproc}, - average_factor_{average_factor}, - bias_{bias}, - postproc_constant_{postproc_constant} + decision_forest(raft_proto::buffer&& nodes, + raft_proto::buffer&& root_node_indexes, + index_type num_features, + index_type num_outputs = index_type{2}, + bool has_categorical_nodes = false, + std::optional>&& vector_output = std::nullopt, + std::optional>&& + categorical_storage = std::nullopt, + index_type leaf_size = index_type{1}, + row_op row_postproc = row_op::disable, + element_op elem_postproc = element_op::disable, + io_type average_factor = io_type{1}, + io_type bias = io_type{0}, + io_type postproc_constant = io_type{1}) + : nodes_{nodes}, + root_node_indexes_{root_node_indexes}, + vector_output_{vector_output}, + categorical_storage_{categorical_storage}, + num_features_{num_features}, + num_outputs_{num_outputs}, + leaf_size_{leaf_size}, + has_categorical_nodes_{has_categorical_nodes}, + row_postproc_{row_postproc}, + elem_postproc_{elem_postproc}, + average_factor_{average_factor}, + bias_{bias}, + postproc_constant_{postproc_constant} { if (nodes.memory_type() != root_node_indexes.memory_type()) { throw raft_proto::mem_type_mismatch( - "Nodes and indexes of forest must both be stored on either host or device" - ); + "Nodes and indexes of forest must both be stored on either host or device"); } if (nodes.device_index() != root_node_indexes.device_index()) { throw raft_proto::mem_type_mismatch( - "Nodes and indexes of forest must both be stored on same device" - ); + "Nodes and indexes of forest must both be stored on same device"); } detail::initialize_device(nodes.device()); } @@ -205,15 +201,12 @@ struct decision_forest { /** The number of outputs per row generated by the model for the given * type of inference */ - auto num_outputs( - infer_kind inference_kind=infer_kind::default_kind - ) const { + auto num_outputs(infer_kind inference_kind = infer_kind::default_kind) const + { auto result = num_outputs_; if (inference_kind == infer_kind::per_tree) { result = num_trees(); - if (has_vector_leaves()) { - result *= num_outputs_; - } + if (has_vector_leaves()) { result *= num_outputs_; } } return result; } @@ -225,13 +218,9 @@ struct decision_forest { auto elem_postprocessing() const { return elem_postproc_; } /** The type of memory (device/host) where the model is stored */ - auto memory_type() { - return nodes_.memory_type(); - } + auto memory_type() { return nodes_.memory_type(); } /** The ID of the device on which this model is loaded */ - auto device_index() { - return nodes_.device_index(); - } + auto device_index() { return nodes_.device_index(); } /** * Perform inference with this model @@ -253,65 +242,56 @@ struct decision_forest { * 1 to 32 is a valid value, and in general larger batches benefit from * larger values. 
*/ - void predict( - raft_proto::buffer& output, - raft_proto::buffer const& input, - raft_proto::cuda_stream stream = raft_proto::cuda_stream{}, - infer_kind predict_type=infer_kind::default_kind, - std::optional specified_rows_per_block_iter=std::nullopt - ) { + void predict(raft_proto::buffer& output, + raft_proto::buffer const& input, + raft_proto::cuda_stream stream = raft_proto::cuda_stream{}, + infer_kind predict_type = infer_kind::default_kind, + std::optional specified_rows_per_block_iter = std::nullopt) + { if (output.memory_type() != memory_type() || input.memory_type() != memory_type()) { throw raft_proto::wrong_device_type{ - "Tried to use host I/O data with model on device or vice versa" - }; + "Tried to use host I/O data with model on device or vice versa"}; } if (output.device_index() != device_index() || input.device_index() != device_index()) { - throw raft_proto::wrong_device{ - "I/O data on different device than model" - }; + throw raft_proto::wrong_device{"I/O data on different device than model"}; } - auto* vector_output_data = ( - vector_output_.has_value() ? vector_output_->data() : static_cast(nullptr) - ); - auto* categorical_storage_data = ( - categorical_storage_.has_value() ? categorical_storage_->data() : static_cast(nullptr) - ); - switch(nodes_.device().index()) { + auto* vector_output_data = + (vector_output_.has_value() ? vector_output_->data() : static_cast(nullptr)); + auto* categorical_storage_data = + (categorical_storage_.has_value() ? categorical_storage_->data() + : static_cast(nullptr)); + switch (nodes_.device().index()) { case 0: - fil::detail::infer( - obj(), - get_postprocessor(predict_type), - output.data(), - input.data(), - index_type(input.size() / num_features_), - num_features_, - num_outputs(predict_type), - has_categorical_nodes_, - vector_output_data, - categorical_storage_data, - predict_type, - specified_rows_per_block_iter, - std::get<0>(nodes_.device()), - stream - ); + fil::detail::infer(obj(), + get_postprocessor(predict_type), + output.data(), + input.data(), + index_type(input.size() / num_features_), + num_features_, + num_outputs(predict_type), + has_categorical_nodes_, + vector_output_data, + categorical_storage_data, + predict_type, + specified_rows_per_block_iter, + std::get<0>(nodes_.device()), + stream); break; case 1: - fil::detail::infer( - obj(), - get_postprocessor(predict_type), - output.data(), - input.data(), - index_type(input.size() / num_features_), - num_features_, - num_outputs(predict_type), - has_categorical_nodes_, - vector_output_data, - categorical_storage_data, - predict_type, - specified_rows_per_block_iter, - std::get<1>(nodes_.device()), - stream - ); + fil::detail::infer(obj(), + get_postprocessor(predict_type), + output.data(), + input.data(), + index_type(input.size() / num_features_), + num_features_, + num_outputs(predict_type), + has_categorical_nodes_, + vector_output_data, + categorical_storage_data, + predict_type, + specified_rows_per_block_iter, + std::get<1>(nodes_.device()), + stream); break; } } @@ -339,27 +319,20 @@ struct decision_forest { io_type bias_; io_type postproc_constant_; - auto obj() const { - return forest_type{ - nodes_.data(), - root_node_indexes_.data(), - static_cast(root_node_indexes_.size()), - num_outputs_ - }; + auto obj() const + { + return forest_type{nodes_.data(), + root_node_indexes_.data(), + static_cast(root_node_indexes_.size()), + num_outputs_}; } - auto get_postprocessor( - infer_kind inference_kind=infer_kind::default_kind - ) const { + auto 
get_postprocessor(infer_kind inference_kind = infer_kind::default_kind) const + { auto result = postprocessor_type{}; if (inference_kind == infer_kind::default_kind) { - result = postprocessor_type { - row_postproc_, - elem_postproc_, - average_factor_, - bias_, - postproc_constant_ - }; + result = postprocessor_type{ + row_postproc_, elem_postproc_, average_factor_, bias_, postproc_constant_}; } return result; } @@ -381,64 +354,50 @@ namespace detail { * @tparam large_trees Whether this forest expects more than 2**(16 -3) - 1 = * 8191 features or contains nodes whose child is offset more than 2**16 - 1 = 65535 nodes away. */ -template< - tree_layout layout, - bool double_precision, - bool large_trees -> +template using preset_decision_forest = decision_forest< layout, typename specialization_types::threshold_type, typename specialization_types::index_type, typename specialization_types::metadata_type, - typename specialization_types::offset_type ->; + typename specialization_types::offset_type>; -} +} // namespace detail /** A variant containing all standard decision_forest instantiations */ -using decision_forest_variant = std::variant< - detail::preset_decision_forest< - std::variant_alternative_t<0, detail::specialization_variant>::layout, - std::variant_alternative_t<0, detail::specialization_variant>::is_double_precision, - std::variant_alternative_t<0, detail::specialization_variant>::has_large_trees - >, - detail::preset_decision_forest< - std::variant_alternative_t<1, detail::specialization_variant>::layout, - std::variant_alternative_t<1, detail::specialization_variant>::is_double_precision, - std::variant_alternative_t<1, detail::specialization_variant>::has_large_trees - >, - detail::preset_decision_forest< - std::variant_alternative_t<2, detail::specialization_variant>::layout, - std::variant_alternative_t<2, detail::specialization_variant>::is_double_precision, - std::variant_alternative_t<2, detail::specialization_variant>::has_large_trees - >, - detail::preset_decision_forest< - std::variant_alternative_t<3, detail::specialization_variant>::layout, - std::variant_alternative_t<3, detail::specialization_variant>::is_double_precision, - std::variant_alternative_t<3, detail::specialization_variant>::has_large_trees - >, - detail::preset_decision_forest< - std::variant_alternative_t<4, detail::specialization_variant>::layout, - std::variant_alternative_t<4, detail::specialization_variant>::is_double_precision, - std::variant_alternative_t<4, detail::specialization_variant>::has_large_trees - >, - detail::preset_decision_forest< - std::variant_alternative_t<5, detail::specialization_variant>::layout, - std::variant_alternative_t<5, detail::specialization_variant>::is_double_precision, - std::variant_alternative_t<5, detail::specialization_variant>::has_large_trees - >, - detail::preset_decision_forest< - std::variant_alternative_t<6, detail::specialization_variant>::layout, - std::variant_alternative_t<6, detail::specialization_variant>::is_double_precision, - std::variant_alternative_t<6, detail::specialization_variant>::has_large_trees - >, - detail::preset_decision_forest< - std::variant_alternative_t<7, detail::specialization_variant>::layout, - std::variant_alternative_t<7, detail::specialization_variant>::is_double_precision, - std::variant_alternative_t<7, detail::specialization_variant>::has_large_trees - > ->; +using decision_forest_variant = + std::variant::layout, + std::variant_alternative_t<0, detail::specialization_variant>::is_double_precision, + 
std::variant_alternative_t<0, detail::specialization_variant>::has_large_trees>, + detail::preset_decision_forest< + std::variant_alternative_t<1, detail::specialization_variant>::layout, + std::variant_alternative_t<1, detail::specialization_variant>::is_double_precision, + std::variant_alternative_t<1, detail::specialization_variant>::has_large_trees>, + detail::preset_decision_forest< + std::variant_alternative_t<2, detail::specialization_variant>::layout, + std::variant_alternative_t<2, detail::specialization_variant>::is_double_precision, + std::variant_alternative_t<2, detail::specialization_variant>::has_large_trees>, + detail::preset_decision_forest< + std::variant_alternative_t<3, detail::specialization_variant>::layout, + std::variant_alternative_t<3, detail::specialization_variant>::is_double_precision, + std::variant_alternative_t<3, detail::specialization_variant>::has_large_trees>, + detail::preset_decision_forest< + std::variant_alternative_t<4, detail::specialization_variant>::layout, + std::variant_alternative_t<4, detail::specialization_variant>::is_double_precision, + std::variant_alternative_t<4, detail::specialization_variant>::has_large_trees>, + detail::preset_decision_forest< + std::variant_alternative_t<5, detail::specialization_variant>::layout, + std::variant_alternative_t<5, detail::specialization_variant>::is_double_precision, + std::variant_alternative_t<5, detail::specialization_variant>::has_large_trees>, + detail::preset_decision_forest< + std::variant_alternative_t<6, detail::specialization_variant>::layout, + std::variant_alternative_t<6, detail::specialization_variant>::is_double_precision, + std::variant_alternative_t<6, detail::specialization_variant>::has_large_trees>, + detail::preset_decision_forest< + std::variant_alternative_t<7, detail::specialization_variant>::layout, + std::variant_alternative_t<7, detail::specialization_variant>::is_double_precision, + std::variant_alternative_t<7, detail::specialization_variant>::has_large_trees>>; /** * Determine the variant index of the decision_forest type to used based on @@ -458,50 +417,43 @@ using decision_forest_variant = std::variant< * models, this should be the total number of leaf nodes. 
* @param layout The in-memory layout to be used for nodes in the forest */ -inline auto get_forest_variant_index( - bool use_double_thresholds, - index_type max_node_offset, - index_type num_features, - index_type num_categorical_nodes = index_type{}, - index_type max_num_categories = index_type{}, - index_type num_vector_leaves = index_type{}, - tree_layout layout = preferred_tree_layout -) { - using small_index_t = typename detail::specialization_types::index_type; +inline auto get_forest_variant_index(bool use_double_thresholds, + index_type max_node_offset, + index_type num_features, + index_type num_categorical_nodes = index_type{}, + index_type max_num_categories = index_type{}, + index_type num_vector_leaves = index_type{}, + tree_layout layout = preferred_tree_layout) +{ + using small_index_t = + typename detail::specialization_types::index_type; auto max_local_categories = index_type(sizeof(small_index_t) * 8); // If the index required for pointing to categorical storage bins or vector // leaf output exceeds what we can store in a uint32_t, uint64_t will be used // // TODO(wphicks): We are overestimating categorical storage required here - auto double_indexes_required = ( - max_num_categories > max_local_categories - && ( - ( - raft_proto::ceildiv(max_num_categories, max_local_categories) + 1 - * num_categorical_nodes - ) > std::numeric_limits::max() - ) - ) || num_vector_leaves > std::numeric_limits::max(); + auto double_indexes_required = + (max_num_categories > max_local_categories && + ((raft_proto::ceildiv(max_num_categories, max_local_categories) + 1 * num_categorical_nodes) > + std::numeric_limits::max())) || + num_vector_leaves > std::numeric_limits::max(); auto double_precision = use_double_thresholds || double_indexes_required; - using small_metadata_t = typename detail::specialization_types::metadata_type; - using small_offset_t = typename detail::specialization_types::offset_type; + using small_metadata_t = + typename detail::specialization_types::metadata_type; + using small_offset_t = + typename detail::specialization_types::offset_type; - auto large_trees = ( - num_features > ( - std::numeric_limits::max() >> reserved_node_metadata_bits - ) || max_node_offset > std::numeric_limits::max() - ); + auto large_trees = + (num_features > (std::numeric_limits::max() >> reserved_node_metadata_bits) || + max_node_offset > std::numeric_limits::max()); auto layout_value = static_cast>(layout); - return ( - (index_type{layout_value} << index_type{2}) - + (index_type{double_precision} << index_type{1}) - + index_type{large_trees} - ); -} -} -} + return ((index_type{layout_value} << index_type{2}) + + (index_type{double_precision} << index_type{1}) + index_type{large_trees}); } +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/detail/bitset.hpp b/cpp/include/cuml/experimental/fil/detail/bitset.hpp index a7ac8e9fda..6a5b65927b 100644 --- a/cpp/include/cuml/experimental/fil/detail/bitset.hpp +++ b/cpp/include/cuml/experimental/fil/detail/bitset.hpp @@ -18,65 +18,54 @@ #ifndef __CUDACC__ #include #endif -#include +#include +#include #include #include #include -#include -#include namespace ML { namespace experimental { namespace fil { namespace detail { -template +template struct bitset { using storage_type = storage_t; - using index_type = index_t; + using index_type = index_t; - auto constexpr static const bin_width = index_type( - sizeof(storage_type) * 8 - ); + auto constexpr static const bin_width = 
index_type(sizeof(storage_type) * 8); - HOST DEVICE bitset() - : data_{nullptr}, num_bits_{0} - { - } + HOST DEVICE bitset() : data_{nullptr}, num_bits_{0} {} - HOST DEVICE bitset(storage_type* data, index_type size) - : data_{data}, num_bits_{size} - { - } + HOST DEVICE bitset(storage_type* data, index_type size) : data_{data}, num_bits_{size} {} - HOST DEVICE bitset(storage_type* data) - : data_{data}, num_bits_(sizeof(storage_type) * 8) - { - } + HOST DEVICE bitset(storage_type* data) : data_{data}, num_bits_(sizeof(storage_type) * 8) {} - HOST DEVICE auto size() const { - return num_bits_; - } - HOST DEVICE auto bin_count() const { + HOST DEVICE auto size() const { return num_bits_; } + HOST DEVICE auto bin_count() const + { return num_bits_ / bin_width + (num_bits_ % bin_width != 0); } // Standard bit-wise mutators and accessor - HOST DEVICE auto& set(index_type index) { + HOST DEVICE auto& set(index_type index) + { data_[bin_from_index(index)] |= mask_in_bin(index); return *this; } - HOST DEVICE auto& clear(index_type index) { + HOST DEVICE auto& clear(index_type index) + { data_[bin_from_index(index)] &= ~mask_in_bin(index); return *this; } - HOST DEVICE auto test(index_type index) const { + HOST DEVICE auto test(index_type index) const + { auto result = false; - if (index < num_bits_) { - result = ((data_[bin_from_index(index)] & mask_in_bin(index)) != 0); - } + if (index < num_bits_) { result = ((data_[bin_from_index(index)] & mask_in_bin(index)) != 0); } return result; } - HOST DEVICE auto& flip() { + HOST DEVICE auto& flip() + { for (auto i = index_type{}; i < bin_count(); ++i) { data_[i] = ~data_[i]; } @@ -84,25 +73,29 @@ struct bitset { } // Bit-wise boolean operations - HOST DEVICE auto& operator&=(bitset const& other) { + HOST DEVICE auto& operator&=(bitset const& other) + { for (auto i = index_type{}; i < min(size(), other.size()); ++i) { data_[i] &= other.data_[i]; } return *this; } - HOST DEVICE auto& operator|=(bitset const& other) { + HOST DEVICE auto& operator|=(bitset const& other) + { for (auto i = index_type{}; i < min(size(), other.size()); ++i) { data_[i] |= other.data_[i]; } return *this; } - HOST DEVICE auto& operator^=(bitset const& other) { + HOST DEVICE auto& operator^=(bitset const& other) + { for (auto i = index_type{}; i < min(size(), other.size()); ++i) { data_[i] ^= other.data_[i]; } return *this; } - HOST DEVICE auto& operator~() const { + HOST DEVICE auto& operator~() const + { flip(); return *this; } @@ -111,16 +104,15 @@ struct bitset { storage_type* data_; index_type num_bits_; - HOST DEVICE auto mask_in_bin(index_type index) const { + HOST DEVICE auto mask_in_bin(index_type index) const + { return storage_type{1} << (index % bin_width); } - HOST DEVICE auto bin_from_index(index_type index) const { - return index / bin_width; - } + HOST DEVICE auto bin_from_index(index_type index) const { return index / bin_width; } }; -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/detail/cpu_introspection.hpp b/cpp/include/cuml/experimental/fil/detail/cpu_introspection.hpp index ef7cb823ac..8a98a66327 100644 --- a/cpp/include/cuml/experimental/fil/detail/cpu_introspection.hpp +++ b/cpp/include/cuml/experimental/fil/detail/cpu_introspection.hpp @@ -24,9 +24,9 @@ namespace detail { #ifdef __cpplib_hardware_interference_size using std::hardware_constructive_interference_size; #else -auto constexpr static const hardware_constructive_interference_size=std::size_t{64}; 
+auto constexpr static const hardware_constructive_interference_size = std::size_t{64}; #endif -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/detail/decision_forest_builder.hpp b/cpp/include/cuml/experimental/fil/detail/decision_forest_builder.hpp index 7e4005f150..f2ffd99cb2 100644 --- a/cpp/include/cuml/experimental/fil/detail/decision_forest_builder.hpp +++ b/cpp/include/cuml/experimental/fil/detail/decision_forest_builder.hpp @@ -17,19 +17,19 @@ #include #include #include -#include -#include -#include -#include -#include +#include #include #include -#include #include -#include #include #include #include +#include +#include +#include +#include +#include +#include namespace ML { namespace experimental { @@ -51,14 +51,14 @@ struct model_builder_error : std::exception { /* * Struct used to build FIL forests */ -template +template struct decision_forest_builder { - /* The type for nodes in the given decision_forest type */ using node_type = typename decision_forest_t::node_type; /* Add a root node, indicating the beginning of a new tree */ - void start_new_tree() { + void start_new_tree() + { if (root_node_indexes_.empty()) { root_node_indexes_.emplace_back(); } else { @@ -71,90 +71,67 @@ struct decision_forest_builder { } } } - root_node_indexes_.push_back( - root_node_indexes_.back() + cur_tree_size_ - ); + root_node_indexes_.push_back(root_node_indexes_.back() + cur_tree_size_); cur_tree_size_ = index_type{}; } } /* Add a node with a categorical split */ - template + template void add_categorical_node( iter_t vec_begin, iter_t vec_end, - bool default_to_distant_child=false, + bool default_to_distant_child = false, typename node_type::metadata_storage_type feature = typename node_type::metadata_storage_type{}, - typename node_type::offset_type offset = typename node_type::offset_type{} - ) { + typename node_type::offset_type offset = typename node_type::offset_type{}) + { auto constexpr const bin_width = index_type(sizeof(typename node_type::index_type) * 8); - auto node_value = typename node_type::index_type{}; - auto set_storage = &node_value; - auto max_node_categories = *std::max_element(vec_begin, vec_end) + 1; + auto node_value = typename node_type::index_type{}; + auto set_storage = &node_value; + auto max_node_categories = *std::max_element(vec_begin, vec_end) + 1; if (max_num_categories_ > bin_width) { // TODO(wphicks): Check for overflow here - node_value = categorical_storage_.size(); + node_value = categorical_storage_.size(); auto bins_required = raft_proto::ceildiv(max_node_categories, bin_width); categorical_storage_.push_back(max_node_categories); categorical_storage_.resize(categorical_storage_.size() + bins_required); set_storage = &(categorical_storage_[node_value + 1]); } auto set = bitset{set_storage, max_node_categories}; - std::for_each( - vec_begin, - vec_end, - [&set](auto&& cat_index) { - set.set(cat_index); - } - ); + std::for_each(vec_begin, vec_end, [&set](auto&& cat_index) { set.set(cat_index); }); - add_node( - node_value, - false, - default_to_distant_child, - true, - feature, - offset, - false - ); + add_node(node_value, false, default_to_distant_child, true, feature, offset, false); } /* Add a leaf node with vector output */ - template - void add_leaf_vector_node( - iter_t vec_begin, - iter_t vec_end - ) { + template + void add_leaf_vector_node(iter_t vec_begin, iter_t vec_end) + { auto leaf_index = typename node_type::index_type(vector_output_.size() / 
output_size_); std::copy(vec_begin, vec_end, std::back_inserter(vector_output_)); - nodes_.emplace_back( - leaf_index, - true, - false, - false, - typename node_type::metadata_storage_type{}, - typename node_type::offset_type{} - ); + nodes_.emplace_back(leaf_index, + true, + false, + false, + typename node_type::metadata_storage_type{}, + typename node_type::offset_type{}); ++cur_tree_size_; } /* Add a node to the model */ - template + template void add_node( value_t val, - bool is_leaf_node=true, - bool default_to_distant_child=false, - bool is_categorical_node=false, + bool is_leaf_node = true, + bool default_to_distant_child = false, + bool is_categorical_node = false, typename node_type::metadata_storage_type feature = typename node_type::metadata_storage_type{}, - typename node_type::offset_type offset = typename node_type::offset_type{}, - bool is_inclusive=false - ) { - if (is_inclusive) { - val = std::nextafter(val, std::numeric_limits::infinity()); - } + typename node_type::offset_type offset = typename node_type::offset_type{}, + bool is_inclusive = false) + { + if (is_inclusive) { val = std::nextafter(val, std::numeric_limits::infinity()); } nodes_.emplace_back( - val, is_leaf_node, default_to_distant_child, is_categorical_node, feature, offset - ); + val, is_leaf_node, default_to_distant_child, is_categorical_node, feature, offset); ++cur_tree_size_; } @@ -170,41 +147,39 @@ struct decision_forest_builder { * (if any) */ void set_postproc_constant(double val) { postproc_constant_ = val; } /* Set the number of outputs per row for this model */ - void set_output_size(index_type val) { + void set_output_size(index_type val) + { if (output_size_ != index_type{1} && output_size_ != val) { throw model_import_error("Inconsistent leaf vector size"); } output_size_ = val; } - decision_forest_builder( - index_type max_num_categories=index_type{}, - index_type align_bytes=index_type{} - ) : - cur_tree_size_{}, - max_num_categories_{max_num_categories}, - alignment_{std::lcm(align_bytes, index_type(sizeof(node_type)))}, - output_size_{1}, - element_postproc_{}, - average_factor_{}, - row_postproc_{}, - bias_{}, - postproc_constant_{}, - max_tree_size_{}, - nodes_{}, - root_node_indexes_{}, - vector_output_{} { + decision_forest_builder(index_type max_num_categories = index_type{}, + index_type align_bytes = index_type{}) + : cur_tree_size_{}, + max_num_categories_{max_num_categories}, + alignment_{std::lcm(align_bytes, index_type(sizeof(node_type)))}, + output_size_{1}, + element_postproc_{}, + average_factor_{}, + row_postproc_{}, + bias_{}, + postproc_constant_{}, + max_tree_size_{}, + nodes_{}, + root_node_indexes_{}, + vector_output_{} + { } /* Return the FIL decision forest built by this builder */ - auto get_decision_forest( - index_type num_feature, - index_type num_class, - raft_proto::device_type mem_type=raft_proto::device_type::cpu, - int device=0, - raft_proto::cuda_stream stream=raft_proto::cuda_stream{} - ) { - + auto get_decision_forest(index_type num_feature, + index_type num_class, + raft_proto::device_type mem_type = raft_proto::device_type::cpu, + int device = 0, + raft_proto::cuda_stream stream = raft_proto::cuda_stream{}) + { // Allow narrowing for preprocessing constants. They are stored as doubles // for consistency in the builder but must be converted to the proper types // for the concrete forest model. 
@@ -212,47 +187,37 @@ struct decision_forest_builder { #pragma GCC diagnostic ignored "-Wnarrowing" return decision_forest_t{ raft_proto::buffer{ - raft_proto::buffer{nodes_.data(), nodes_.size()}, - mem_type, - device, - stream - }, - raft_proto::buffer{ - raft_proto::buffer{root_node_indexes_.data(), root_node_indexes_.size()}, - mem_type, - device, - stream - }, + raft_proto::buffer{nodes_.data(), nodes_.size()}, mem_type, device, stream}, + raft_proto::buffer{raft_proto::buffer{root_node_indexes_.data(), root_node_indexes_.size()}, + mem_type, + device, + stream}, num_feature, num_class, max_num_categories_ != 0, - vector_output_.empty() ? - std::nullopt : - std::make_optional>( - raft_proto::buffer{vector_output_.data(), vector_output_.size()}, - mem_type, - device, - stream - ), - categorical_storage_.empty() ? - std::nullopt : - std::make_optional>( - raft_proto::buffer{categorical_storage_.data(), categorical_storage_.size()}, - mem_type, - device, - stream - ), + vector_output_.empty() + ? std::nullopt + : std::make_optional>( + raft_proto::buffer{vector_output_.data(), vector_output_.size()}, + mem_type, + device, + stream), + categorical_storage_.empty() + ? std::nullopt + : std::make_optional>( + raft_proto::buffer{categorical_storage_.data(), categorical_storage_.size()}, + mem_type, + device, + stream), output_size_, row_postproc_, element_postproc_, static_cast(average_factor_), static_cast(bias_), - static_cast(postproc_constant_) - }; + static_cast(postproc_constant_)}; #pragma GCC diagnostic pop } - private: index_type cur_tree_size_; index_type max_num_categories_; @@ -271,7 +236,7 @@ struct decision_forest_builder { std::vector categorical_storage_; }; -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/detail/device_initialization.hpp b/cpp/include/cuml/experimental/fil/detail/device_initialization.hpp index 51a87460b1..eaaa570819 100644 --- a/cpp/include/cuml/experimental/fil/detail/device_initialization.hpp +++ b/cpp/include/cuml/experimental/fil/detail/device_initialization.hpp @@ -15,8 +15,8 @@ */ #pragma once -#include #include +#include #ifdef CUML_ENABLE_GPU #include #endif @@ -26,19 +26,23 @@ namespace experimental { namespace fil { namespace detail { /* Set any required device options for optimizing FIL compute */ -template -void initialize_device(raft_proto::device_id device) { +template +void initialize_device(raft_proto::device_id device) +{ device_initialization::initialize_device(device); } /* Set any required device options for optimizing FIL compute */ -template -void initialize_device(raft_proto::device_id_variant device) { - std::visit([](auto&& concrete_device) { - device_initialization::initialize_device(concrete_device); - }, device); -} -} -} -} +template +void initialize_device(raft_proto::device_id_variant device) +{ + std::visit( + [](auto&& concrete_device) { + device_initialization::initialize_device(concrete_device); + }, + device); } +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/detail/device_initialization/cpu.hpp b/cpp/include/cuml/experimental/fil/detail/device_initialization/cpu.hpp index 2bcf7a4c4c..2735aa2978 100644 --- a/cpp/include/cuml/experimental/fil/detail/device_initialization/cpu.hpp +++ b/cpp/include/cuml/experimental/fil/detail/device_initialization/cpu.hpp @@ -14,10 +14,10 @@ * limitations under the License. 
*/ #pragma once -#include #include #include #include +#include namespace ML { namespace experimental { namespace fil { @@ -29,11 +29,16 @@ namespace device_initialization { * This specialization will also be used for non-GPU-enabled builds * (as a GPU no-op). */ -template -std::enable_if_t, std::bool_constant>, void> initialize_device(raft_proto::device_id device) {} - -} -} -} -} +template +std::enable_if_t, + std::bool_constant>, + void> +initialize_device(raft_proto::device_id device) +{ } + +} // namespace device_initialization +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/detail/device_initialization/gpu.cuh b/cpp/include/cuml/experimental/fil/detail/device_initialization/gpu.cuh index acdd19f8c3..90fab16fff 100644 --- a/cpp/include/cuml/experimental/fil/detail/device_initialization/gpu.cuh +++ b/cpp/include/cuml/experimental/fil/detail/device_initialization/gpu.cuh @@ -14,17 +14,17 @@ * limitations under the License. */ #pragma once -#include #include #include +#include #include #include -#include -#include #include #include #include #include +#include +#include namespace ML { namespace experimental { namespace fil { @@ -37,306 +37,197 @@ namespace device_initialization { * the inference kernels have access to the maximum available dynamic shared * memory. */ -template -std::enable_if_t, std::bool_constant>, void> initialize_device(raft_proto::device_id device) { - auto device_context = raft_proto::device_setter(device); +template +std::enable_if_t, + std::bool_constant>, + void> +initialize_device(raft_proto::device_id device) +{ + auto device_context = raft_proto::device_setter(device); auto max_shared_mem_per_block = get_max_shared_mem_per_block(device); // Run solely for side-effect of caching SM count get_sm_count(device); raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - 
infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - 
cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); - raft_proto::cuda_check( - cudaFuncSetAttribute( - infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block - ) - ); + cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check( + cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check( + cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check( + cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check( + cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check( + cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check( + cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check( + cudaFuncSetAttribute(infer_kernel, + 
cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check( + cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check( + cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check( + cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check( + cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check( + cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute( + infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute( + infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute( + infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute( + infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute( + infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute( + infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute( + infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute( + infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute( + infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute( + infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute( + infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check(cudaFuncSetAttribute( + infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check( + cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check( + cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check( + cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check( + cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check( + cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + raft_proto::cuda_check( + cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); } CUML_FIL_INITIALIZE_DEVICE(extern template, 0) @@ -348,9 +239,9 @@ CUML_FIL_INITIALIZE_DEVICE(extern template, 5) CUML_FIL_INITIALIZE_DEVICE(extern template, 6) CUML_FIL_INITIALIZE_DEVICE(extern template, 7) -} -} -} +} // namespace device_initialization +} 
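The repeated cudaFuncSetAttribute calls above raise the dynamic shared memory limit for every infer_kernel instantiation. As a standalone reference (not part of this patch; the kernel, sizes, and launch configuration are hypothetical), the opt-in pattern is:

// Illustrative sketch only (not part of this patch): opt a kernel into the
// full shared memory carve-out before launching it with a large dynamic
// shared memory request. The kernel below is hypothetical.
#include <cuda_runtime.h>

__global__ void scratch_kernel(float* out)
{
  extern __shared__ float scratch[];  // dynamic shared memory
  scratch[threadIdx.x] = static_cast<float>(threadIdx.x);
  __syncthreads();
  out[threadIdx.x] = scratch[threadIdx.x];
}

void configure_and_launch(float* out, int device)
{
  cudaSetDevice(device);
  auto max_optin_smem = int{};
  cudaDeviceGetAttribute(&max_optin_smem, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
  // Requests above the default 48 KiB of dynamic shared memory fail unless the
  // kernel has been opted in via this attribute.
  cudaFuncSetAttribute(scratch_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_optin_smem);
  scratch_kernel<<<1, 256, max_optin_smem>>>(out);
}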
// namespace detail +} // namespace fil -} -} +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/detail/device_initialization/gpu.hpp b/cpp/include/cuml/experimental/fil/detail/device_initialization/gpu.hpp index ca6090200a..22e5a2a7c8 100644 --- a/cpp/include/cuml/experimental/fil/detail/device_initialization/gpu.hpp +++ b/cpp/include/cuml/experimental/fil/detail/device_initialization/gpu.hpp @@ -14,11 +14,11 @@ * limitations under the License. */ #pragma once -#include #include #include #include #include +#include namespace ML { namespace experimental { namespace fil { @@ -28,12 +28,15 @@ namespace device_initialization { /* Non-CUDA header declaration of the GPU specialization for device * initialization */ -template -std::enable_if_t, std::bool_constant>, void> initialize_device(raft_proto::device_id device); +template +std::enable_if_t, + std::bool_constant>, + void> +initialize_device(raft_proto::device_id device); -} -} -} +} // namespace device_initialization +} // namespace detail +} // namespace fil -} -} +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/detail/evaluate_tree.hpp b/cpp/include/cuml/experimental/fil/detail/evaluate_tree.hpp index e9b8a87ebc..4e0b1a04e3 100644 --- a/cpp/include/cuml/experimental/fil/detail/evaluate_tree.hpp +++ b/cpp/include/cuml/experimental/fil/detail/evaluate_tree.hpp @@ -37,27 +37,18 @@ namespace detail { * @param node Pointer to the root node of this tree * @param row Pointer to the input data for this row */ -template< - bool has_vector_leaves, - bool has_categorical_nodes, - typename node_t, - typename io_t -> -HOST DEVICE auto evaluate_tree( - node_t const* __restrict__ node, - io_t const* __restrict__ row -) { +template +HOST DEVICE auto evaluate_tree(node_t const* __restrict__ node, io_t const* __restrict__ row) +{ using categorical_set_type = bitset; - auto cur_node = *node; + auto cur_node = *node; do { auto input_val = row[cur_node.feature_index()]; auto condition = true; if constexpr (has_categorical_nodes) { if (cur_node.is_categorical()) { auto valid_categories = categorical_set_type{ - &cur_node.index(), - uint32_t(sizeof(typename node_t::index_type) * 8) - }; + &cur_node.index(), uint32_t(sizeof(typename node_t::index_type) * 8)}; condition = valid_categories.test(input_val); } else { condition = (input_val < cur_node.threshold()); @@ -65,9 +56,7 @@ HOST DEVICE auto evaluate_tree( } else { condition = (input_val < cur_node.threshold()); } - if (!condition && cur_node.default_distant()) { - condition = isnan(input_val); - } + if (!condition && cur_node.default_distant()) { condition = isnan(input_val); } node += cur_node.child_offset(condition); cur_node = *node; } while (!cur_node.is_leaf()); @@ -97,28 +86,21 @@ HOST DEVICE auto evaluate_tree( * @param categorical_storage Pointer to where categorical split data is * stored. 
*/ -template< - bool has_vector_leaves, - typename node_t, - typename io_t, - typename categorical_storage_t -> -HOST DEVICE auto evaluate_tree( - node_t const* __restrict__ node, - io_t const* __restrict__ row, - categorical_storage_t const* __restrict__ categorical_storage -) { +template +HOST DEVICE auto evaluate_tree(node_t const* __restrict__ node, + io_t const* __restrict__ row, + categorical_storage_t const* __restrict__ categorical_storage) +{ using categorical_set_type = bitset; - auto cur_node = *node; + auto cur_node = *node; do { auto input_val = row[cur_node.feature_index()]; auto condition = cur_node.default_distant(); if (!isnan(input_val)) { if (cur_node.is_categorical()) { - auto valid_categories = categorical_set_type{ - categorical_storage + cur_node.index() + 1, - uint32_t(categorical_storage[cur_node.index()]) - }; + auto valid_categories = + categorical_set_type{categorical_storage + cur_node.index() + 1, + uint32_t(categorical_storage[cur_node.index()])}; condition = valid_categories.test(input_val); } else { condition = (input_val < cur_node.threshold()); @@ -130,7 +112,7 @@ HOST DEVICE auto evaluate_tree( return cur_node.template output(); } -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/detail/forest.hpp b/cpp/include/cuml/experimental/fil/detail/forest.hpp index a8a741a661..e031a336b5 100644 --- a/cpp/include/cuml/experimental/fil/detail/forest.hpp +++ b/cpp/include/cuml/experimental/fil/detail/forest.hpp @@ -14,11 +14,11 @@ * limitations under the License. */ #pragma once -#include -#include -#include #include #include +#include +#include +#include namespace ML { namespace experimental { @@ -26,35 +26,42 @@ namespace fil { /* A collection of trees which together form a forest model */ -template +template struct forest { using node_type = node; - using io_type = threshold_t; + using io_type = threshold_t; template - using raw_output_type = std::conditional_t< - !std::is_same_v, - std::remove_pointer_t, - typename node_type::threshold_type - >; + using raw_output_type = std::conditional_t, + std::remove_pointer_t, + typename node_type::threshold_type>; - HOST DEVICE forest(node_type* forest_nodes, index_type* forest_root_indexes, index_type num_trees, index_type num_outputs) : - nodes_{forest_nodes}, root_node_indexes_{forest_root_indexes}, num_trees_{num_trees}, num_outputs_{num_outputs} {} + HOST DEVICE forest(node_type* forest_nodes, + index_type* forest_root_indexes, + index_type num_trees, + index_type num_outputs) + : nodes_{forest_nodes}, + root_node_indexes_{forest_root_indexes}, + num_trees_{num_trees}, + num_outputs_{num_outputs} + { + } /* Return pointer to the root node of the indicated tree */ - HOST DEVICE auto* get_tree_root(index_type tree_index) const { + HOST DEVICE auto* get_tree_root(index_type tree_index) const + { return nodes_ + root_node_indexes_[tree_index]; } /* Return the number of trees in this forest */ - HOST DEVICE auto tree_count() const { - return num_trees_; - } + HOST DEVICE auto tree_count() const { return num_trees_; } /* Return the number of outputs per row for default evaluation of this * forest */ - HOST DEVICE auto num_outputs() const { - return num_outputs_; - } + HOST DEVICE auto num_outputs() const { return num_outputs_; } private: node_type* nodes_; @@ -63,6 +70,6 @@ struct forest { index_type num_outputs_; }; -} -} -} +} // namespace fil +} // namespace experimental +} // namespace ML diff --git 
a/cpp/include/cuml/experimental/fil/detail/gpu_introspection.hpp b/cpp/include/cuml/experimental/fil/detail/gpu_introspection.hpp index 4fbfd8422b..ded753d682 100644 --- a/cpp/include/cuml/experimental/fil/detail/gpu_introspection.hpp +++ b/cpp/include/cuml/experimental/fil/detail/gpu_introspection.hpp @@ -26,103 +26,79 @@ namespace experimental { namespace fil { namespace detail { -inline auto get_max_shared_mem_per_block(raft_proto::device_id device_id) { +inline auto get_max_shared_mem_per_block( + raft_proto::device_id device_id) +{ auto thread_local cache = std::vector{}; if (cache.size() == 0) { auto device_count = int{}; raft_proto::cuda_check(cudaGetDeviceCount(&device_count)); cache.resize(device_count); - for (auto dev=0; dev < device_count; ++dev) { + for (auto dev = 0; dev < device_count; ++dev) { raft_proto::cuda_check( - cudaDeviceGetAttribute( - &(cache[dev]), - cudaDevAttrMaxSharedMemoryPerBlockOptin, - dev - ) - ); + cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMaxSharedMemoryPerBlockOptin, dev)); } } return index_type(cache.at(device_id.value())); } -inline auto get_sm_count(raft_proto::device_id device_id) { +inline auto get_sm_count(raft_proto::device_id device_id) +{ auto thread_local cache = std::vector{}; if (cache.size() == 0) { auto device_count = int{}; raft_proto::cuda_check(cudaGetDeviceCount(&device_count)); cache.resize(device_count); - for (auto dev=0; dev < device_count; ++dev) { + for (auto dev = 0; dev < device_count; ++dev) { raft_proto::cuda_check( - cudaDeviceGetAttribute( - &(cache[dev]), - cudaDevAttrMultiProcessorCount, - dev - ) - ); + cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMultiProcessorCount, dev)); } } return index_type(cache.at(device_id.value())); } -inline auto get_max_threads_per_sm(raft_proto::device_id device_id) { +inline auto get_max_threads_per_sm(raft_proto::device_id device_id) +{ auto result = int{}; raft_proto::cuda_check( - cudaDeviceGetAttribute( - &result, - cudaDevAttrMaxThreadsPerMultiProcessor, - device_id.value() - ) - ); + cudaDeviceGetAttribute(&result, cudaDevAttrMaxThreadsPerMultiProcessor, device_id.value())); return index_type(result); } -inline auto get_max_shared_mem_per_sm(raft_proto::device_id device_id) { +inline auto get_max_shared_mem_per_sm(raft_proto::device_id device_id) +{ auto thread_local cache = std::vector{}; if (cache.size() == 0) { auto device_count = int{}; raft_proto::cuda_check(cudaGetDeviceCount(&device_count)); cache.resize(device_count); - for (auto dev=0; dev < device_count; ++dev) { + for (auto dev = 0; dev < device_count; ++dev) { raft_proto::cuda_check( - cudaDeviceGetAttribute( - &(cache[dev]), - cudaDevAttrMaxSharedMemoryPerMultiprocessor, - dev - ) - ); + cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMaxSharedMemoryPerMultiprocessor, dev)); } } return index_type(cache.at(device_id.value())); } -inline auto get_mem_clock_rate(raft_proto::device_id device_id) { +inline auto get_mem_clock_rate(raft_proto::device_id device_id) +{ auto result = int{}; raft_proto::cuda_check( - cudaDeviceGetAttribute( - &result, - cudaDevAttrMemoryClockRate, - device_id.value() - ) - ); + cudaDeviceGetAttribute(&result, cudaDevAttrMemoryClockRate, device_id.value())); return index_type(result); } -inline auto get_core_clock_rate(raft_proto::device_id device_id) { +inline auto get_core_clock_rate(raft_proto::device_id device_id) +{ auto result = int{}; - raft_proto::cuda_check( - cudaDeviceGetAttribute( - &result, - cudaDevAttrClockRate, - device_id.value() - ) - ); + 
raft_proto::cuda_check(cudaDeviceGetAttribute(&result, cudaDevAttrClockRate, device_id.value())); return index_type(result); } /* The maximum number of bytes that can be read in a single instruction */ -auto constexpr static const MAX_READ_CHUNK = index_type{128}; -auto constexpr static const MAX_BLOCKS = index_type{65536}; -auto constexpr static const WARP_SIZE = index_type{32}; +auto constexpr static const MAX_READ_CHUNK = index_type{128}; +auto constexpr static const MAX_BLOCKS = index_type{65536}; +auto constexpr static const WARP_SIZE = index_type{32}; auto constexpr static const MAX_THREADS_PER_BLOCK = index_type{256}; #ifdef __CUDACC__ #if __CUDA_ARCH__ == 720 || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 860 || __CUDA_ARCH__ == 870 @@ -136,7 +112,7 @@ auto constexpr static const MAX_THREADS_PER_SM = index_type{2048}; auto constexpr static const MIN_BLOCKS_PER_SM = MAX_THREADS_PER_SM / MAX_THREADS_PER_BLOCK; -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/detail/index_type.hpp b/cpp/include/cuml/experimental/fil/detail/index_type.hpp index 353f9bea82..fa2a6da187 100644 --- a/cpp/include/cuml/experimental/fil/detail/index_type.hpp +++ b/cpp/include/cuml/experimental/fil/detail/index_type.hpp @@ -20,5 +20,5 @@ namespace experimental { namespace fil { using index_type = uint32_t; } -} -} +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/detail/infer.hpp b/cpp/include/cuml/experimental/fil/detail/infer.hpp index 140dd5e508..4a0625dee9 100644 --- a/cpp/include/cuml/experimental/fil/detail/infer.hpp +++ b/cpp/include/cuml/experimental/fil/detail/infer.hpp @@ -15,20 +15,20 @@ */ #pragma once #include +#include +#include +#include #include #include #include -#include -#include -#include #ifdef CUML_ENABLE_GPU #include #endif #include -#include #include #include #include +#include namespace ML { namespace experimental { namespace fil { @@ -61,131 +61,118 @@ namespace detail { * @param device The device on which to execute evaluation * @param stream Optionally, the CUDA stream to use */ -template -void infer( - forest_t const& forest, - postprocessor const& postproc, - typename forest_t::io_type* output, - typename forest_t::io_type* input, - index_type row_count, - index_type col_count, - index_type output_count, - bool has_categorical_nodes, - typename forest_t::io_type* vector_output=nullptr, - typename forest_t::node_type::index_type* categorical_data=nullptr, - infer_kind infer_type=infer_kind::default_kind, - std::optional specified_chunk_size=std::nullopt, - raft_proto::device_id device=raft_proto::device_id{}, - raft_proto::cuda_stream stream=raft_proto::cuda_stream{} -) { +template +void infer(forest_t const& forest, + postprocessor const& postproc, + typename forest_t::io_type* output, + typename forest_t::io_type* input, + index_type row_count, + index_type col_count, + index_type output_count, + bool has_categorical_nodes, + typename forest_t::io_type* vector_output = nullptr, + typename forest_t::node_type::index_type* categorical_data = nullptr, + infer_kind infer_type = infer_kind::default_kind, + std::optional specified_chunk_size = std::nullopt, + raft_proto::device_id device = raft_proto::device_id{}, + raft_proto::cuda_stream stream = raft_proto::cuda_stream{}) +{ if (vector_output == nullptr) { if (categorical_data == nullptr) { if (!has_categorical_nodes) { - inference::infer ( - forest, - postproc, - output, - input, - 
row_count, - col_count, - output_count, - nullptr, - nullptr, - infer_type, - specified_chunk_size, - device, - stream - ); + inference::infer(forest, + postproc, + output, + input, + row_count, + col_count, + output_count, + nullptr, + nullptr, + infer_type, + specified_chunk_size, + device, + stream); } else { - inference::infer ( - forest, - postproc, - output, - input, - row_count, - col_count, - output_count, - nullptr, - nullptr, - infer_type, - specified_chunk_size, - device, - stream - ); + inference::infer(forest, + postproc, + output, + input, + row_count, + col_count, + output_count, + nullptr, + nullptr, + infer_type, + specified_chunk_size, + device, + stream); } } else { - inference::infer ( - forest, - postproc, - output, - input, - row_count, - col_count, - output_count, - nullptr, - categorical_data, - infer_type, - specified_chunk_size, - device, - stream - ); + inference::infer(forest, + postproc, + output, + input, + row_count, + col_count, + output_count, + nullptr, + categorical_data, + infer_type, + specified_chunk_size, + device, + stream); } } else { if (categorical_data == nullptr) { if (!has_categorical_nodes) { - inference::infer ( - forest, - postproc, - output, - input, - row_count, - col_count, - output_count, - vector_output, - nullptr, - infer_type, - specified_chunk_size, - device, - stream - ); + inference::infer(forest, + postproc, + output, + input, + row_count, + col_count, + output_count, + vector_output, + nullptr, + infer_type, + specified_chunk_size, + device, + stream); } else { - inference::infer ( - forest, - postproc, - output, - input, - row_count, - col_count, - output_count, - vector_output, - nullptr, - infer_type, - specified_chunk_size, - device, - stream - ); + inference::infer(forest, + postproc, + output, + input, + row_count, + col_count, + output_count, + vector_output, + nullptr, + infer_type, + specified_chunk_size, + device, + stream); } } else { - inference::infer ( - forest, - postproc, - output, - input, - row_count, - col_count, - output_count, - vector_output, - categorical_data, - infer_type, - specified_chunk_size, - device, - stream - ); + inference::infer(forest, + postproc, + output, + input, + row_count, + col_count, + output_count, + vector_output, + categorical_data, + infer_type, + specified_chunk_size, + device, + stream); } } } -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/detail/infer/cpu.hpp b/cpp/include/cuml/experimental/fil/detail/infer/cpu.hpp index 8647da404e..b1f3a092f4 100644 --- a/cpp/include/cuml/experimental/fil/detail/infer/cpu.hpp +++ b/cpp/include/cuml/experimental/fil/detail/infer/cpu.hpp @@ -15,19 +15,19 @@ */ #pragma once #include -#include #include -#include #include #include #include #include #include -#include #include #include #include #include +#include +#include +#include namespace ML { namespace experimental { namespace fil { @@ -73,29 +73,29 @@ namespace inference { * (for individual row inference) to 512 (for very large batch * inference). A value of 64 is a generally-useful default. 
*/ -template< - raft_proto::device_type D, - bool has_categorical_nodes, - typename forest_t, - typename vector_output_t=std::nullptr_t, - typename categorical_data_t=std::nullptr_t -> -std::enable_if_t, std::bool_constant>, void> infer( - forest_t const& forest, - postprocessor const& postproc, - typename forest_t::io_type* output, - typename forest_t::io_type* input, - index_type row_count, - index_type col_count, - index_type output_count, - vector_output_t vector_output=nullptr, - categorical_data_t categorical_data=nullptr, - infer_kind infer_type=infer_kind::default_kind, - std::optional specified_chunk_size=std::nullopt, - raft_proto::device_id device=raft_proto::device_id{}, - raft_proto::cuda_stream=raft_proto::cuda_stream{} -) { - if constexpr(D==raft_proto::device_type::gpu) { +template +std::enable_if_t, + std::bool_constant>, + void> +infer(forest_t const& forest, + postprocessor const& postproc, + typename forest_t::io_type* output, + typename forest_t::io_type* input, + index_type row_count, + index_type col_count, + index_type output_count, + vector_output_t vector_output = nullptr, + categorical_data_t categorical_data = nullptr, + infer_kind infer_type = infer_kind::default_kind, + std::optional specified_chunk_size = std::nullopt, + raft_proto::device_id device = raft_proto::device_id{}, + raft_proto::cuda_stream = raft_proto::cuda_stream{}) +{ + if constexpr (D == raft_proto::device_type::gpu) { throw raft_proto::gpu_unsupported("Tried to use GPU inference in CPU-only build"); } else { infer_kernel_cpu( @@ -110,8 +110,7 @@ std::enable_if_t -#include -#include #include -#include #include #include -#include #include +#include #include -#include -#include #include #include #include @@ -33,6 +28,11 @@ #include #include #include +#include +#include +#include +#include +#include namespace ML { namespace experimental { @@ -40,18 +40,14 @@ namespace fil { namespace detail { namespace inference { -inline auto compute_output_size( - index_type row_output_size, - index_type threads_per_block, - index_type rows_per_block_iteration, - infer_kind infer_type=infer_kind::default_kind -) { +inline auto compute_output_size(index_type row_output_size, + index_type threads_per_block, + index_type rows_per_block_iteration, + infer_kind infer_type = infer_kind::default_kind) +{ auto result = row_output_size * rows_per_block_iteration; if (infer_type == infer_kind::default_kind) { - result *= raft_proto::ceildiv( - threads_per_block, - rows_per_block_iteration - ); + result *= raft_proto::ceildiv(threads_per_block, rows_per_block_iteration); } return result; } @@ -91,14 +87,12 @@ inline auto compute_output_size( * value depends on hardware, model, and batch size. Valid values are any power * of 2 from 1 to 32. 
*/ -template< - raft_proto::device_type D, - bool has_categorical_nodes, - typename forest_t, - typename vector_output_t=std::nullptr_t, - typename categorical_data_t=std::nullptr_t -> -std::enable_if_t infer( +template +std::enable_if_t infer( forest_t const& forest, postprocessor const& postproc, typename forest_t::io_type* output, @@ -106,276 +100,197 @@ std::enable_if_t infer( index_type row_count, index_type col_count, index_type output_count, - vector_output_t vector_output=nullptr, - categorical_data_t categorical_data=nullptr, - infer_kind infer_type=infer_kind::default_kind, - std::optional specified_chunk_size=std::nullopt, - raft_proto::device_id device=raft_proto::device_id{}, - raft_proto::cuda_stream stream=raft_proto::cuda_stream{} -) { + vector_output_t vector_output = nullptr, + categorical_data_t categorical_data = nullptr, + infer_kind infer_type = infer_kind::default_kind, + std::optional specified_chunk_size = std::nullopt, + raft_proto::device_id device = raft_proto::device_id{}, + raft_proto::cuda_stream stream = raft_proto::cuda_stream{}) +{ using output_t = typename forest_t::template raw_output_type; - auto sm_count = get_sm_count(device); + auto sm_count = get_sm_count(device); auto const max_shared_mem_per_block = get_max_shared_mem_per_block(device); - auto const max_shared_mem_per_sm = get_max_shared_mem_per_sm(device); - auto const max_overall_shared_mem = std::min( - max_shared_mem_per_block, max_shared_mem_per_sm - ); + auto const max_shared_mem_per_sm = get_max_shared_mem_per_sm(device); + auto const max_overall_shared_mem = std::min(max_shared_mem_per_block, max_shared_mem_per_sm); - auto row_size_bytes = index_type( - index_type(sizeof(typename forest_t::io_type) * col_count) - ); + auto row_size_bytes = index_type(index_type(sizeof(typename forest_t::io_type) * col_count)); auto row_output_size = output_count; - auto row_output_size_bytes = index_type(sizeof( - typename forest_t::io_type - ) * row_output_size); + auto row_output_size_bytes = index_type(sizeof(typename forest_t::io_type) * row_output_size); // First determine the number of threads per block. This is the indicated // preferred value unless we cannot handle at least 1 row per block iteration // with available shared memory, in which case we must reduce the threads per // block. 
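The comment above describes how the launcher budgets shared memory before settling on a thread count. A rough standalone sketch of that first budgeting step (not part of this patch; the helper and constants are hypothetical, and the real code falls back to dropping row or output storage when even a single warp does not fit):

// Illustrative sketch only (not part of this patch). Mirrors the first step of
// the heuristic described above with hypothetical names and constants.
#include <algorithm>
#include <cstdint>

constexpr std::uint32_t kWarpSize           = 32;
constexpr std::uint32_t kMaxThreadsPerBlock = 256;

// Round down to a whole number of warps.
constexpr std::uint32_t round_down_to_warp(std::uint32_t threads)
{
  return (threads / kWarpSize) * kWarpSize;
}

std::uint32_t pick_threads_per_block(std::uint32_t max_smem_bytes,
                                     std::uint32_t row_bytes,
                                     std::uint32_t row_output_bytes)
{
  // One shared input row plus one output slot per thread must fit in shared memory.
  auto budgeted = round_down_to_warp((max_smem_bytes - row_bytes) / row_output_bytes);
  return std::min(kMaxThreadsPerBlock, budgeted);
}

// Worked example: with 48 KiB of shared memory, 100 float features per row
// (400 bytes) and 10 float outputs per row (40 bytes), the budget is
// (49152 - 400) / 40 = 1218 threads, which rounds down to 1216 and is then
// clamped to kMaxThreadsPerBlock.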
- auto threads_per_block = min( - MAX_THREADS_PER_BLOCK, - raft_proto::downpadded_size( - (max_shared_mem_per_block - row_size_bytes) / row_output_size_bytes, - WARP_SIZE - ) - ); + auto threads_per_block = + min(MAX_THREADS_PER_BLOCK, + raft_proto::downpadded_size( + (max_shared_mem_per_block - row_size_bytes) / row_output_size_bytes, WARP_SIZE)); // If we cannot do at least a warp per block when storing input rows in // shared mem, recalculate our threads per block without input storage if (threads_per_block < WARP_SIZE) { - threads_per_block = min( - MAX_THREADS_PER_BLOCK, - raft_proto::downpadded_size( - max_shared_mem_per_block / row_output_size_bytes, - WARP_SIZE - ) - ); + threads_per_block = + min(MAX_THREADS_PER_BLOCK, + raft_proto::downpadded_size(max_shared_mem_per_block / row_output_size_bytes, WARP_SIZE)); if (threads_per_block >= WARP_SIZE) { - row_size_bytes = index_type{}; // Do not store input rows in shared mem + row_size_bytes = index_type{}; // Do not store input rows in shared mem } } // If we cannot do at least a warp per block when storing output in // shared mem, recalculate our threads per block with ONLY input storage if (threads_per_block < WARP_SIZE) { - threads_per_block = min( - MAX_THREADS_PER_BLOCK, - raft_proto::downpadded_size( - max_shared_mem_per_block / row_size_bytes, - WARP_SIZE - ) - ); + threads_per_block = + min(MAX_THREADS_PER_BLOCK, + raft_proto::downpadded_size(max_shared_mem_per_block / row_size_bytes, WARP_SIZE)); } // If we still cannot use at least a warp per block, give up on using // shared memory and just maximize occupancy - if (threads_per_block < WARP_SIZE) { - threads_per_block = MAX_THREADS_PER_BLOCK; - } + if (threads_per_block < WARP_SIZE) { threads_per_block = MAX_THREADS_PER_BLOCK; } - auto const max_resident_blocks = sm_count * ( - get_max_threads_per_sm(device) / threads_per_block - ); + auto const max_resident_blocks = sm_count * (get_max_threads_per_sm(device) / threads_per_block); // Compute shared memory usage based on minimum or specified // rows_per_block_iteration - auto rows_per_block_iteration = specified_chunk_size.value_or( - index_type{1} - ); + auto rows_per_block_iteration = specified_chunk_size.value_or(index_type{1}); auto constexpr const output_item_bytes = index_type(sizeof(output_t)); - auto output_workspace_size = compute_output_size( - row_output_size, threads_per_block, rows_per_block_iteration, infer_type - ); + auto output_workspace_size = + compute_output_size(row_output_size, threads_per_block, rows_per_block_iteration, infer_type); auto output_workspace_size_bytes = output_item_bytes * output_workspace_size; - auto global_workspace = raft_proto::buffer{}; + auto global_workspace = raft_proto::buffer{}; if (output_workspace_size_bytes > max_shared_mem_per_block) { output_workspace_size_bytes = 0; - row_output_size = 0; + row_output_size = 0; } - auto shared_mem_per_block = min( - rows_per_block_iteration * row_size_bytes + output_workspace_size_bytes, - max_overall_shared_mem - ); + auto shared_mem_per_block = + min(rows_per_block_iteration * row_size_bytes + output_workspace_size_bytes, + max_overall_shared_mem); - auto resident_blocks_per_sm = min( - raft_proto::ceildiv(max_shared_mem_per_sm, shared_mem_per_block), - max_resident_blocks - ); + auto resident_blocks_per_sm = + min(raft_proto::ceildiv(max_shared_mem_per_sm, shared_mem_per_block), max_resident_blocks); // If caller has not specified the number of rows per block iteration, apply // the following heuristic to identify an approximately 
optimal value - if ( - !specified_chunk_size.has_value() - && resident_blocks_per_sm >= MIN_BLOCKS_PER_SM - ) { + if (!specified_chunk_size.has_value() && resident_blocks_per_sm >= MIN_BLOCKS_PER_SM) { rows_per_block_iteration = index_type{32}; } do { - output_workspace_size = compute_output_size( - row_output_size, threads_per_block, rows_per_block_iteration, infer_type - ); + output_workspace_size = + compute_output_size(row_output_size, threads_per_block, rows_per_block_iteration, infer_type); output_workspace_size_bytes = output_item_bytes * output_workspace_size; - shared_mem_per_block = ( - rows_per_block_iteration * row_size_bytes + output_workspace_size_bytes - ); + shared_mem_per_block = + (rows_per_block_iteration * row_size_bytes + output_workspace_size_bytes); if (shared_mem_per_block > max_overall_shared_mem) { rows_per_block_iteration >>= index_type{1}; } - } while ( - shared_mem_per_block > max_overall_shared_mem - && rows_per_block_iteration > 1 - ); + } while (shared_mem_per_block > max_overall_shared_mem && rows_per_block_iteration > 1); shared_mem_per_block = std::min(shared_mem_per_block, max_overall_shared_mem); // Divide shared mem evenly - shared_mem_per_block = std::min(max_overall_shared_mem, max_shared_mem_per_sm / ( - max_shared_mem_per_sm / shared_mem_per_block - )); + shared_mem_per_block = std::min( + max_overall_shared_mem, max_shared_mem_per_sm / (max_shared_mem_per_sm / shared_mem_per_block)); - auto num_blocks = std::min( - raft_proto::ceildiv(row_count, rows_per_block_iteration), - MAX_BLOCKS - ); + auto num_blocks = std::min(raft_proto::ceildiv(row_count, rows_per_block_iteration), MAX_BLOCKS); if (row_output_size == 0) { global_workspace = raft_proto::buffer{ - output_workspace_size * num_blocks, - raft_proto::device_type::gpu, - device.value(), - stream - }; + output_workspace_size * num_blocks, raft_proto::device_type::gpu, device.value(), stream}; } if (rows_per_block_iteration <= 1) { - infer_kernel<<< - num_blocks, - threads_per_block, - shared_mem_per_block, - stream - >>>( - forest, - postproc, - output, - input, - row_count, - col_count, - output_count, - shared_mem_per_block, - output_workspace_size, - vector_output, - categorical_data, - infer_type, - global_workspace.data() - ); + infer_kernel + <<>>(forest, + postproc, + output, + input, + row_count, + col_count, + output_count, + shared_mem_per_block, + output_workspace_size, + vector_output, + categorical_data, + infer_type, + global_workspace.data()); } else if (rows_per_block_iteration <= 2) { - infer_kernel<<< - num_blocks, - threads_per_block, - shared_mem_per_block, - stream - >>>( - forest, - postproc, - output, - input, - row_count, - col_count, - output_count, - shared_mem_per_block, - output_workspace_size, - vector_output, - categorical_data, - infer_type, - global_workspace.data() - ); + infer_kernel + <<>>(forest, + postproc, + output, + input, + row_count, + col_count, + output_count, + shared_mem_per_block, + output_workspace_size, + vector_output, + categorical_data, + infer_type, + global_workspace.data()); } else if (rows_per_block_iteration <= 4) { - infer_kernel<<< - num_blocks, - threads_per_block, - shared_mem_per_block, - stream - >>>( - forest, - postproc, - output, - input, - row_count, - col_count, - output_count, - shared_mem_per_block, - output_workspace_size, - vector_output, - categorical_data, - infer_type, - global_workspace.data() - ); + infer_kernel + <<>>(forest, + postproc, + output, + input, + row_count, + col_count, + output_count, + 
shared_mem_per_block, + output_workspace_size, + vector_output, + categorical_data, + infer_type, + global_workspace.data()); } else if (rows_per_block_iteration <= 8) { - infer_kernel<<< - num_blocks, - threads_per_block, - shared_mem_per_block, - stream - >>>( - forest, - postproc, - output, - input, - row_count, - col_count, - output_count, - shared_mem_per_block, - output_workspace_size, - vector_output, - categorical_data, - infer_type, - global_workspace.data() - ); + infer_kernel + <<>>(forest, + postproc, + output, + input, + row_count, + col_count, + output_count, + shared_mem_per_block, + output_workspace_size, + vector_output, + categorical_data, + infer_type, + global_workspace.data()); } else if (rows_per_block_iteration <= 16) { - infer_kernel<<< - num_blocks, - threads_per_block, - shared_mem_per_block, - stream - >>>( - forest, - postproc, - output, - input, - row_count, - col_count, - output_count, - shared_mem_per_block, - output_workspace_size, - vector_output, - categorical_data, - infer_type, - global_workspace.data() - ); + infer_kernel + <<>>(forest, + postproc, + output, + input, + row_count, + col_count, + output_count, + shared_mem_per_block, + output_workspace_size, + vector_output, + categorical_data, + infer_type, + global_workspace.data()); } else { - infer_kernel<<< - num_blocks, - threads_per_block, - shared_mem_per_block, - stream - >>>( - forest, - postproc, - output, - input, - row_count, - col_count, - output_count, - shared_mem_per_block, - output_workspace_size, - vector_output, - categorical_data, - infer_type, - global_workspace.data() - ); + infer_kernel + <<>>(forest, + postproc, + output, + input, + row_count, + col_count, + output_count, + shared_mem_per_block, + output_workspace_size, + vector_output, + categorical_data, + infer_type, + global_workspace.data()); } raft_proto::cuda_check(cudaGetLastError()); } @@ -394,8 +309,8 @@ CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::gpu, 5) CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::gpu, 6) CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::gpu, 7) -} -} -} -} -} +} // namespace inference +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/detail/infer/gpu.hpp b/cpp/include/cuml/experimental/fil/detail/infer/gpu.hpp index 6e48d55f0e..795ce49ed5 100644 --- a/cpp/include/cuml/experimental/fil/detail/infer/gpu.hpp +++ b/cpp/include/cuml/experimental/fil/detail/infer/gpu.hpp @@ -14,14 +14,14 @@ * limitations under the License. 
*/ #include -#include -#include #include #include #include #include #include #include +#include +#include namespace ML { namespace experimental { @@ -30,14 +30,12 @@ namespace detail { namespace inference { /* The CUDA-free header declaration of the GPU infer template */ -template< - raft_proto::device_type D, - bool has_categorical_nodes, - typename forest_t, - typename vector_output_t=std::nullptr_t, - typename categorical_data_t=std::nullptr_t -> -std::enable_if_t infer( +template +std::enable_if_t infer( forest_t const& forest, postprocessor const& postproc, typename forest_t::io_type* output, @@ -45,16 +43,15 @@ std::enable_if_t infer( index_type row_count, index_type col_count, index_type class_count, - vector_output_t vector_output=nullptr, - categorical_data_t categorical_data=nullptr, - infer_kind infer_type = infer_kind::default_kind, - std::optional specified_chunk_size=std::nullopt, - raft_proto::device_id device=raft_proto::device_id{}, - raft_proto::cuda_stream stream=raft_proto::cuda_stream{} -); + vector_output_t vector_output = nullptr, + categorical_data_t categorical_data = nullptr, + infer_kind infer_type = infer_kind::default_kind, + std::optional specified_chunk_size = std::nullopt, + raft_proto::device_id device = raft_proto::device_id{}, + raft_proto::cuda_stream stream = raft_proto::cuda_stream{}); -} -} -} -} -} +} // namespace inference +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/detail/infer_kernel/cpu.hpp b/cpp/include/cuml/experimental/fil/detail/infer_kernel/cpu.hpp index 08a9819c27..1fa0a03675 100644 --- a/cpp/include/cuml/experimental/fil/detail/infer_kernel/cpu.hpp +++ b/cpp/include/cuml/experimental/fil/detail/infer_kernel/cpu.hpp @@ -15,16 +15,16 @@ */ #pragma once #include -#include -#include -#include -#include -#include #include #include #include #include #include +#include +#include +#include +#include +#include namespace ML { namespace experimental { @@ -64,134 +64,101 @@ namespace detail { * and produce an output per row. If set to "per_tree", we will instead output all outputs of * individual trees. 
*/ -template< - bool has_categorical_nodes, - typename forest_t, - typename vector_output_t=std::nullptr_t, - typename categorical_data_t=std::nullptr_t -> -void infer_kernel_cpu( - forest_t const& forest, - postprocessor const& postproc, - typename forest_t::io_type* output, - typename forest_t::io_type const* input, - index_type row_count, - index_type col_count, - index_type num_outputs, - index_type chunk_size=hardware_constructive_interference_size, - index_type grove_size=hardware_constructive_interference_size, - vector_output_t vector_output_p=nullptr, - categorical_data_t categorical_data=nullptr, - infer_kind infer_type=infer_kind::default_kind -) { - auto const default_num_outputs = forest.num_outputs(); - auto constexpr has_vector_leaves = !std::is_same_v; +template +void infer_kernel_cpu(forest_t const& forest, + postprocessor const& postproc, + typename forest_t::io_type* output, + typename forest_t::io_type const* input, + index_type row_count, + index_type col_count, + index_type num_outputs, + index_type chunk_size = hardware_constructive_interference_size, + index_type grove_size = hardware_constructive_interference_size, + vector_output_t vector_output_p = nullptr, + categorical_data_t categorical_data = nullptr, + infer_kind infer_type = infer_kind::default_kind) +{ + auto const default_num_outputs = forest.num_outputs(); + auto constexpr has_vector_leaves = !std::is_same_v; auto constexpr has_nonlocal_categories = !std::is_same_v; - + using node_t = typename forest_t::node_type; using output_t = typename forest_t::template raw_output_type; - auto const num_tree = forest.tree_count(); + auto const num_tree = forest.tree_count(); auto const num_grove = raft_proto::ceildiv(num_tree, grove_size); auto const num_chunk = raft_proto::ceildiv(row_count, chunk_size); - auto output_workspace = std::vector( - row_count * num_outputs * num_grove, - output_t{} - ); + auto output_workspace = std::vector(row_count * num_outputs * num_grove, output_t{}); auto const task_count = num_grove * num_chunk; // Infer on each grove and chunk #pragma omp parallel for - for(auto task_index = index_type{}; task_index < task_count; ++task_index) { + for (auto task_index = index_type{}; task_index < task_count; ++task_index) { auto const grove_index = task_index / num_chunk; auto const chunk_index = task_index % num_chunk; - auto const start_row = chunk_index * chunk_size; - auto const end_row = std::min(start_row + chunk_size, row_count); - auto const start_tree = grove_index * grove_size; - auto const end_tree = std::min(start_tree + grove_size, num_tree); + auto const start_row = chunk_index * chunk_size; + auto const end_row = std::min(start_row + chunk_size, row_count); + auto const start_tree = grove_index * grove_size; + auto const end_tree = std::min(start_tree + grove_size, num_tree); - for (auto row_index = start_row; row_index < end_row; ++row_index){ + for (auto row_index = start_row; row_index < end_row; ++row_index) { for (auto tree_index = start_tree; tree_index < end_tree; ++tree_index) { - auto tree_output = std::conditional_t< - has_vector_leaves, typename node_t::index_type, typename node_t::threshold_type - >{}; + auto tree_output = std::conditional_t{}; if constexpr (has_nonlocal_categories) { tree_output = evaluate_tree( - forest.get_tree_root(tree_index), - input + row_index * col_count, - categorical_data - ); + forest.get_tree_root(tree_index), input + row_index * col_count, categorical_data); } else { tree_output = evaluate_tree( - forest.get_tree_root(tree_index), - input 
+ row_index * col_count - ); + forest.get_tree_root(tree_index), input + row_index * col_count); } if constexpr (has_vector_leaves) { - auto output_offset = ( - row_index * num_outputs * num_grove - + tree_index * default_num_outputs * num_grove * ( - infer_type == infer_kind::per_tree - ) + grove_index - ); - for ( - auto output_index=index_type{}; - output_index < default_num_outputs; - ++output_index - ) { - output_workspace[ - output_offset + output_index * num_grove - ] += vector_output_p[ - tree_output * default_num_outputs + output_index - ]; + auto output_offset = + (row_index * num_outputs * num_grove + + tree_index * default_num_outputs * num_grove * (infer_type == infer_kind::per_tree) + + grove_index); + for (auto output_index = index_type{}; output_index < default_num_outputs; + ++output_index) { + output_workspace[output_offset + output_index * num_grove] += + vector_output_p[tree_output * default_num_outputs + output_index]; } } else { - auto output_offset = ( - row_index * num_outputs * num_grove - + (tree_index % default_num_outputs) * num_grove * ( - infer_type == infer_kind::default_kind - ) + tree_index * num_grove * ( - infer_type == infer_kind::per_tree - ) + grove_index - ); - output_workspace[ - output_offset - ] += tree_output; + auto output_offset = + (row_index * num_outputs * num_grove + + (tree_index % default_num_outputs) * num_grove * + (infer_type == infer_kind::default_kind) + + tree_index * num_grove * (infer_type == infer_kind::per_tree) + grove_index); + output_workspace[output_offset] += tree_output; } } // Trees - } // Rows - } // Tasks + } // Rows + } // Tasks // Sum over grove and postprocess #pragma omp parallel for - for (auto row_index=index_type{}; row_index < row_count; ++row_index) { - for ( - auto output_index = index_type{}; - output_index < num_outputs; - ++output_index - ) { - auto grove_offset = ( - row_index * num_outputs * num_grove + output_index * num_grove - ); + for (auto row_index = index_type{}; row_index < row_count; ++row_index) { + for (auto output_index = index_type{}; output_index < num_outputs; ++output_index) { + auto grove_offset = (row_index * num_outputs * num_grove + output_index * num_grove); - output_workspace[grove_offset] = std::accumulate( - std::begin(output_workspace) + grove_offset, - std::begin(output_workspace) + grove_offset + num_grove, - output_t{} - ); + output_workspace[grove_offset] = + std::accumulate(std::begin(output_workspace) + grove_offset, + std::begin(output_workspace) + grove_offset + num_grove, + output_t{}); } - postproc( - output_workspace.data() + row_index * num_outputs * num_grove, - num_outputs, - output + row_index * num_outputs, - num_grove - ); + postproc(output_workspace.data() + row_index * num_outputs * num_grove, + num_outputs, + output + row_index * num_outputs, + num_grove); } } -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/detail/infer_kernel/gpu.cuh b/cpp/include/cuml/experimental/fil/detail/infer_kernel/gpu.cuh index bca359d234..e4189e7a84 100644 --- a/cpp/include/cuml/experimental/fil/detail/infer_kernel/gpu.cuh +++ b/cpp/include/cuml/experimental/fil/detail/infer_kernel/gpu.cuh @@ -15,15 +15,15 @@ */ #pragma once #include -#include -#include #include #include #include -#include #include +#include #include #include +#include +#include namespace ML { namespace experimental { @@ -72,31 +72,28 @@ namespace detail { * @param global_mem_fallback_buffer Buffer to use as a 
fallback, when there isn't enough shared * memory. Set it to nullptr to disable */ -template< - bool has_categorical_nodes, - index_type chunk_size, - typename forest_t, - typename vector_output_t=std::nullptr_t, - typename categorical_data_t=std::nullptr_t -> -__global__ void __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) -infer_kernel( - forest_t forest, - postprocessor postproc, - typename forest_t::io_type* output, - typename forest_t::io_type const* input, - index_type row_count, - index_type col_count, - index_type num_outputs, - index_type shared_mem_byte_size, - index_type output_workspace_size, - vector_output_t vector_output_p=nullptr, - categorical_data_t categorical_data=nullptr, - infer_kind infer_type=infer_kind::default_kind, - typename forest_t::template raw_output_type* workspace_fallback=nullptr -) { - auto const default_num_outputs = forest.num_outputs(); - auto constexpr has_vector_leaves = !std::is_same_v; +template +__global__ void __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) infer_kernel( + forest_t forest, + postprocessor postproc, + typename forest_t::io_type* output, + typename forest_t::io_type const* input, + index_type row_count, + index_type col_count, + index_type num_outputs, + index_type shared_mem_byte_size, + index_type output_workspace_size, + vector_output_t vector_output_p = nullptr, + categorical_data_t categorical_data = nullptr, + infer_kind infer_type = infer_kind::default_kind, + typename forest_t::template raw_output_type* workspace_fallback = nullptr) +{ + auto const default_num_outputs = forest.num_outputs(); + auto constexpr has_vector_leaves = !std::is_same_v; auto constexpr has_nonlocal_categories = !std::is_same_v; using output_t = typename forest_t::template raw_output_type; extern __shared__ std::byte shared_mem_raw[]; @@ -107,38 +104,22 @@ infer_kernel( using io_t = typename forest_t::io_type; - for ( - auto i=blockIdx.x * chunk_size; - i < row_count; - i += chunk_size * gridDim.x - ) { - + for (auto i = blockIdx.x * chunk_size; i < row_count; i += chunk_size * gridDim.x) { shared_mem.clear(); auto* output_workspace = shared_mem.fill( - output_workspace_size, output_t{}, ( - workspace_fallback + - blockIdx.x * output_workspace_size - ) - ); + output_workspace_size, output_t{}, (workspace_fallback + blockIdx.x * output_workspace_size)); // Handle as many rows as requested per loop or as many rows as are left to // process auto rows_in_this_iteration = min(chunk_size, row_count - i); - auto* input_data = shared_mem.copy( - input + i * col_count, - rows_in_this_iteration, - col_count - ); + auto* input_data = shared_mem.copy(input + i * col_count, rows_in_this_iteration, col_count); auto task_count = chunk_size * forest.tree_count(); - auto num_grove = raft_proto::ceildiv( - min(index_type(blockDim.x), task_count), - chunk_size - ) * (infer_type == infer_kind::default_kind) + ( - infer_type != infer_kind::default_kind - ); + auto num_grove = raft_proto::ceildiv(min(index_type(blockDim.x), task_count), chunk_size) * + (infer_type == infer_kind::default_kind) + + (infer_type != infer_kind::default_kind); // Note that this sync is safe because every thread in the block will agree // on whether or not a sync is required @@ -151,129 +132,78 @@ infer_kernel( auto const task_count_rounded_up = blockDim.x * raft_proto::ceildiv(task_count, blockDim.x); // Infer on each tree and row - for ( - auto task_index = threadIdx.x; - task_index < task_count_rounded_up; - task_index += blockDim.x - ) { + for (auto task_index = 
threadIdx.x; task_index < task_count_rounded_up; + task_index += blockDim.x) { auto row_index = task_index % chunk_size; auto real_task = task_index < task_count && row_index < rows_in_this_iteration; row_index *= real_task; - auto tree_index = task_index * real_task / chunk_size; - auto grove_index = (threadIdx.x / chunk_size) * ( - infer_type == infer_kind::default_kind - ); + auto tree_index = task_index * real_task / chunk_size; + auto grove_index = (threadIdx.x / chunk_size) * (infer_type == infer_kind::default_kind); - auto tree_output = std::conditional_t< - has_vector_leaves, typename node_t::index_type, typename node_t::threshold_type - >{}; + auto tree_output = std::conditional_t{}; if constexpr (has_nonlocal_categories) { tree_output = evaluate_tree( - forest.get_tree_root(tree_index), - input_data + row_index * col_count, - categorical_data - ); + forest.get_tree_root(tree_index), input_data + row_index * col_count, categorical_data); } else { tree_output = evaluate_tree( - forest.get_tree_root(tree_index), - input_data + row_index * col_count - ); + forest.get_tree_root(tree_index), input_data + row_index * col_count); } if constexpr (has_vector_leaves) { - auto output_offset = ( - row_index * num_outputs * num_grove - + tree_index * default_num_outputs * num_grove * ( - infer_type == infer_kind::per_tree - ) + grove_index - ); - for ( - auto output_index=index_type{}; - output_index < default_num_outputs; - ++output_index - ) { + auto output_offset = + (row_index * num_outputs * num_grove + + tree_index * default_num_outputs * num_grove * (infer_type == infer_kind::per_tree) + + grove_index); + for (auto output_index = index_type{}; output_index < default_num_outputs; ++output_index) { if (real_task) { - output_workspace[ - output_offset + output_index * num_grove - ] += vector_output_p[ - tree_output * default_num_outputs - + output_index - ]; + output_workspace[output_offset + output_index * num_grove] += + vector_output_p[tree_output * default_num_outputs + output_index]; } } } else { - auto output_offset = ( - row_index * num_outputs * num_grove - + (tree_index % default_num_outputs) * num_grove * ( - infer_type == infer_kind::default_kind - ) + tree_index * num_grove * ( - infer_type == infer_kind::per_tree - ) + grove_index - ); - if (real_task) { - output_workspace[ - output_offset - ] += tree_output; - } + auto output_offset = + (row_index * num_outputs * num_grove + + (tree_index % default_num_outputs) * num_grove * + (infer_type == infer_kind::default_kind) + + tree_index * num_grove * (infer_type == infer_kind::per_tree) + grove_index); + if (real_task) { output_workspace[output_offset] += tree_output; } } __syncthreads(); } auto padded_num_groves = raft_proto::padded_size(num_grove, WARP_SIZE); - for ( - auto row_index = threadIdx.x / WARP_SIZE; - row_index < rows_in_this_iteration; - row_index += blockDim.x / WARP_SIZE - ) { - for ( - auto class_index = index_type{}; - class_index < num_outputs; - ++class_index - ) { - auto grove_offset = ( - row_index * num_outputs * num_grove + class_index * num_grove - ); - auto class_sum = output_t{}; - for ( - auto grove_index = threadIdx.x % WARP_SIZE; - grove_index < padded_num_groves; - grove_index += WARP_SIZE - ) { + for (auto row_index = threadIdx.x / WARP_SIZE; row_index < rows_in_this_iteration; + row_index += blockDim.x / WARP_SIZE) { + for (auto class_index = index_type{}; class_index < num_outputs; ++class_index) { + auto grove_offset = (row_index * num_outputs * num_grove + class_index * num_grove); + auto 
class_sum = output_t{}; + for (auto grove_index = threadIdx.x % WARP_SIZE; grove_index < padded_num_groves; + grove_index += WARP_SIZE) { auto real_thread = grove_index < num_grove; - auto out_index = grove_offset + grove_index * real_thread; + auto out_index = grove_offset + grove_index * real_thread; class_sum *= (threadIdx.x % WARP_SIZE == 0); class_sum += output_workspace[out_index] * real_thread; - for ( - auto thread_offset = (WARP_SIZE >> 1); - thread_offset > 0; - thread_offset >>= 1 - ) { - class_sum += __shfl_down_sync( - 0xFFFFFFFF, - class_sum, - thread_offset - ); + for (auto thread_offset = (WARP_SIZE >> 1); thread_offset > 0; thread_offset >>= 1) { + class_sum += __shfl_down_sync(0xFFFFFFFF, class_sum, thread_offset); } } - if (threadIdx.x % WARP_SIZE == 0) { - output_workspace[grove_offset] = class_sum; - } + if (threadIdx.x % WARP_SIZE == 0) { output_workspace[grove_offset] = class_sum; } } if (threadIdx.x % WARP_SIZE == 0) { - postproc( - output_workspace + row_index * num_outputs * num_grove, - num_outputs, - output + ((i + row_index) * num_outputs), - num_grove - ); + postproc(output_workspace + row_index * num_outputs * num_grove, + num_outputs, + output + ((i + row_index) * num_outputs), + num_grove); } } __syncthreads(); } } -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/detail/infer_kernel/shared_memory_buffer.cuh b/cpp/include/cuml/experimental/fil/detail/infer_kernel/shared_memory_buffer.cuh index d99fc8c344..f3b6b2ac34 100644 --- a/cpp/include/cuml/experimental/fil/detail/infer_kernel/shared_memory_buffer.cuh +++ b/cpp/include/cuml/experimental/fil/detail/infer_kernel/shared_memory_buffer.cuh @@ -15,9 +15,9 @@ */ #pragma once #include +#include #include #include -#include namespace ML { namespace experimental { @@ -29,8 +29,10 @@ namespace fil { * @param size The size in bytes of the shared memory allocation */ struct shared_memory_buffer { - __device__ shared_memory_buffer(std::byte* buffer=nullptr, index_type size=index_type{}) : - data{buffer}, total_size{size}, remaining_data{buffer}, remaining_size{size} {} + __device__ shared_memory_buffer(std::byte* buffer = nullptr, index_type size = index_type{}) + : data{buffer}, total_size{size}, remaining_data{buffer}, remaining_size{size} + { + } /* If possible, copy the given number of rows with the given number of columns from source * to the end of this buffer, padding each row by the given number of @@ -38,12 +40,14 @@ struct shared_memory_buffer { * room, no copy is performed. Return a pointer to the desired data, whether * that is in the original location or copied to shared memory. */ template - __device__ auto* copy( - T* source, index_type row_count, index_type col_count, index_type row_pad=index_type{} - ) { - auto* dest = reinterpret_cast*>(remaining_data); + __device__ auto* copy(T* source, + index_type row_count, + index_type col_count, + index_type row_pad = index_type{}) + { + auto* dest = reinterpret_cast*>(remaining_data); auto source_count = row_count * col_count; - auto dest_count = row_count * (col_count + row_pad); + auto dest_count = row_count * (col_count + row_pad); auto copy_data = (dest_count * sizeof(T) <= remaining_size); @@ -52,7 +56,7 @@ struct shared_memory_buffer { dest[i + row_pad * (i / col_count)] = source[i]; } - auto* result = copy_data ? static_cast(dest) : source; + auto* result = copy_data ? 
static_cast(dest) : source; requires_sync = requires_sync || copy_data; auto offset = dest_count * index_type(sizeof(T)); @@ -67,7 +71,8 @@ struct shared_memory_buffer { * desired data, whether that is in the original location or copied to shared * memory. */ template - __device__ auto* copy(T* source, index_type element_count) { + __device__ auto* copy(T* source, index_type element_count) + { auto* dest = reinterpret_cast*>(remaining_data); auto copy_data = (element_count * index_type(sizeof(T)) <= remaining_size); @@ -76,7 +81,7 @@ struct shared_memory_buffer { for (auto i = threadIdx.x; i < element_count; i += blockDim.x) { dest[i] = source[i]; } - auto* result = copy_data ? static_cast(dest) : source; + auto* result = copy_data ? static_cast(dest) : source; requires_sync = requires_sync || copy_data; auto offset = element_count * index_type(sizeof(T)); @@ -90,7 +95,8 @@ struct shared_memory_buffer { * there is not enough room, the fill is not performed. Return a pointer to * the start of the desired data if the fill was possible or else nullptr. */ template - __device__ auto* fill(index_type element_count, T value=T{}, T* fallback_buffer=nullptr) { + __device__ auto* fill(index_type element_count, T value = T{}, T* fallback_buffer = nullptr) + { auto* dest = reinterpret_cast*>(remaining_data); auto copy_data = (element_count * index_type(sizeof(T)) <= remaining_size); @@ -100,7 +106,7 @@ struct shared_memory_buffer { dest[i] = value; } - auto* result = copy_data ? static_cast(dest) : fallback_buffer; + auto* result = copy_data ? static_cast(dest) : fallback_buffer; requires_sync = requires_sync || copy_data; auto offset = element_count * index_type(sizeof(T)); @@ -112,7 +118,8 @@ struct shared_memory_buffer { /* Clear all stored data and return a pointer to the beginning of available * shared memory */ - __device__ auto* clear() { + __device__ auto* clear() + { remaining_size = total_size; remaining_data = data; return remaining_data; @@ -120,7 +127,8 @@ struct shared_memory_buffer { /* Pad stored data to ensure correct alignment for given type */ template - __device__ void align() { + __device__ void align() + { auto pad_required = (total_size - remaining_size) % index_type(sizeof(T)); remaining_data += pad_required; remaining_size -= pad_required; @@ -128,17 +136,14 @@ struct shared_memory_buffer { /* If necessary, sync threads. Note that this can cause a deadlock if not all * threads call this method. */ - __device__ void sync() { - if (requires_sync) { - __syncthreads(); - } + __device__ void sync() + { + if (requires_sync) { __syncthreads(); } requires_sync = false; } /* Return the remaining size in bytes left in this buffer */ - __device__ auto remaining() { - return remaining_size; - } + __device__ auto remaining() { return remaining_size; } private: std::byte* data; @@ -148,6 +153,6 @@ struct shared_memory_buffer { bool requires_sync; }; -} -} -} +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/detail/node.hpp b/cpp/include/cuml/experimental/fil/detail/node.hpp index 21d6e7d611..72ded859df 100644 --- a/cpp/include/cuml/experimental/fil/detail/node.hpp +++ b/cpp/include/cuml/experimental/fil/detail/node.hpp @@ -14,11 +14,11 @@ * limitations under the License. 
*/ #pragma once -#include -#include #include #include #include +#include +#include namespace ML { namespace experimental { @@ -29,36 +29,22 @@ namespace detail { /* * Return the byte size to which a node with the given types should be aligned */ -template -auto constexpr get_node_alignment() { - auto total = index_type( - std::max(sizeof(threshold_t), sizeof(index_t)) - + sizeof(metadata_storage_t) - + sizeof(offset_t) - ); +template +auto constexpr get_node_alignment() +{ + auto total = index_type(std::max(sizeof(threshold_t), sizeof(index_t)) + + sizeof(metadata_storage_t) + sizeof(offset_t)); auto result = index_type{8}; - if (total > result) { - result = index_type{16}; - } - if (total > result) { - result = index_type{32}; - } - if (total > result) { - result = index_type{64}; - } - if (total > result) { - result = index_type{128}; - } - if (total > result) { - result = index_type{256}; - } - if (total > result) { - result = total; - } + if (total > result) { result = index_type{16}; } + if (total > result) { result = index_type{32}; } + if (total > result) { result = index_type{64}; } + if (total > result) { result = index_type{128}; } + if (total > result) { result = index_type{256}; } + if (total > result) { result = total; } return result; } -} +} // namespace detail /* @brief A single node in a forest model * @@ -96,10 +82,13 @@ auto constexpr get_node_alignment() { * this node to its most distant child. This type must be large enough to store * the largest such offset in the forest model. */ -template +template struct alignas( - detail::get_node_alignment() -) node { + detail::get_node_alignment()) node { // @brief An alias for layout_v auto constexpr static const layout = layout_v; // @brief An alias for threshold_t @@ -127,80 +116,81 @@ struct alignas( // one #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wnarrowing" - HOST DEVICE constexpr node( - threshold_type value = threshold_type{}, - bool is_leaf_node = true, - bool default_to_distant_child = false, - bool is_categorical_node = false, - metadata_storage_type feature = metadata_storage_type{}, - offset_type distant_child_offset = offset_type{} - ) : aligned_data{ - .inner_data={ - {.value=value}, - distant_child_offset, - construct_metadata( - is_leaf_node, default_to_distant_child, is_categorical_node, feature - ) - }} {} + HOST DEVICE constexpr node(threshold_type value = threshold_type{}, + bool is_leaf_node = true, + bool default_to_distant_child = false, + bool is_categorical_node = false, + metadata_storage_type feature = metadata_storage_type{}, + offset_type distant_child_offset = offset_type{}) + : aligned_data{ + .inner_data = { + {.value = value}, + distant_child_offset, + construct_metadata(is_leaf_node, default_to_distant_child, is_categorical_node, feature)}} + { + } - HOST DEVICE constexpr node( - index_type index, - bool is_leaf_node = true, - bool default_to_distant_child = false, - bool is_categorical_node = false, - metadata_storage_type feature = metadata_storage_type{}, - offset_type distant_child_offset = offset_type{} - ) : aligned_data{ - .inner_data={ - {.index=index}, - distant_child_offset, - construct_metadata( - is_leaf_node, default_to_distant_child, is_categorical_node, feature - ) - }} {} + HOST DEVICE constexpr node(index_type index, + bool is_leaf_node = true, + bool default_to_distant_child = false, + bool is_categorical_node = false, + metadata_storage_type feature = metadata_storage_type{}, + offset_type distant_child_offset = offset_type{}) + : aligned_data{ + 
.inner_data = { + {.index = index}, + distant_child_offset, + construct_metadata(is_leaf_node, default_to_distant_child, is_categorical_node, feature)}} + { + } #pragma GCC diagnostic pop /* The index of the feature for this node */ - HOST DEVICE auto constexpr feature_index() const { + HOST DEVICE auto constexpr feature_index() const + { return aligned_data.inner_data.metadata & FEATURE_MASK; } /* Whether or not this node is a leaf node */ - HOST DEVICE auto constexpr is_leaf() const { + HOST DEVICE auto constexpr is_leaf() const + { return !bool(aligned_data.inner_data.distant_offset); } /* Whether or not to default to distant child in case of missing values */ - HOST DEVICE auto constexpr default_distant() const { + HOST DEVICE auto constexpr default_distant() const + { return bool(aligned_data.inner_data.metadata & DEFAULT_DISTANT_MASK); } /* Whether or not this node is a categorical node */ - HOST DEVICE auto constexpr is_categorical() const { + HOST DEVICE auto constexpr is_categorical() const + { return bool(aligned_data.inner_data.metadata & CATEGORICAL_MASK); } /* The offset to the child of this node if it evaluates to given condition */ - HOST DEVICE auto constexpr child_offset(bool condition) const { + HOST DEVICE auto constexpr child_offset(bool condition) const + { if constexpr (layout == tree_layout::depth_first) { return offset_type{1} + condition * (aligned_data.inner_data.distant_offset - offset_type{1}); } else if constexpr (layout == tree_layout::breadth_first) { - return condition * offset_type{1} + (aligned_data.inner_data.distant_offset - offset_type{1}); + return condition* offset_type{1} + (aligned_data.inner_data.distant_offset - offset_type{1}); } else { static_assert(layout == tree_layout::depth_first); } } /* The threshold value for this node */ - HOST DEVICE auto constexpr threshold() const { + HOST DEVICE auto constexpr threshold() const + { return aligned_data.inner_data.stored_value.value; } /* The index value for this node */ - HOST DEVICE auto const& index() const { - return aligned_data.inner_data.stored_value.index; - } + HOST DEVICE auto const& index() const { return aligned_data.inner_data.stored_value.index; } /* The output value for this node * * @tparam output_t The expected output type for this node. */ template - HOST DEVICE auto constexpr output() const { + HOST DEVICE auto constexpr output() const + { if constexpr (has_vector_leaves) { return aligned_data.inner_data.stored_value.index; } else { @@ -215,36 +205,30 @@ struct alignas( * the case of a missing value, and the third tells us whether or not this is * a categorical node. 
The remaining bits indicate the index of the feature * for this node */ - auto constexpr static const LEAF_BIT = metadata_storage_type( - index_type(sizeof(metadata_storage_type) * 8 - 1) - ); - auto constexpr static const LEAF_MASK = metadata_storage_type(1 << LEAF_BIT); + auto constexpr static const LEAF_BIT = + metadata_storage_type(index_type(sizeof(metadata_storage_type) * 8 - 1)); + auto constexpr static const LEAF_MASK = metadata_storage_type(1 << LEAF_BIT); auto constexpr static const DEFAULT_DISTANT_BIT = metadata_storage_type(LEAF_BIT - 1); - auto constexpr static const DEFAULT_DISTANT_MASK = metadata_storage_type(1 << DEFAULT_DISTANT_BIT); - auto constexpr static const CATEGORICAL_BIT = metadata_storage_type(DEFAULT_DISTANT_BIT - 1); + auto constexpr static const DEFAULT_DISTANT_MASK = + metadata_storage_type(1 << DEFAULT_DISTANT_BIT); + auto constexpr static const CATEGORICAL_BIT = metadata_storage_type(DEFAULT_DISTANT_BIT - 1); auto constexpr static const CATEGORICAL_MASK = metadata_storage_type(1 << CATEGORICAL_BIT); - auto constexpr static const FEATURE_MASK = metadata_storage_type( - ~(LEAF_MASK | DEFAULT_DISTANT_MASK | CATEGORICAL_MASK) - ); + auto constexpr static const FEATURE_MASK = + metadata_storage_type(~(LEAF_MASK | DEFAULT_DISTANT_MASK | CATEGORICAL_MASK)); // Helper function for bit packing with the above masks - auto static constexpr construct_metadata( - bool is_leaf_node = true, - bool default_to_distant_child = false, - bool is_categorical_node = false, - metadata_storage_type feature = metadata_storage_type{} - ) { + auto static constexpr construct_metadata(bool is_leaf_node = true, + bool default_to_distant_child = false, + bool is_categorical_node = false, + metadata_storage_type feature = metadata_storage_type{}) + { return metadata_storage_type( - (is_leaf_node << LEAF_BIT) + - (default_to_distant_child << DEFAULT_DISTANT_BIT) + - (is_categorical_node << CATEGORICAL_BIT) + - (feature & FEATURE_MASK) - ); + (is_leaf_node << LEAF_BIT) + (default_to_distant_child << DEFAULT_DISTANT_BIT) + + (is_categorical_node << CATEGORICAL_BIT) + (feature & FEATURE_MASK)); } - auto static constexpr const byte_size = detail::get_node_alignment< - threshold_t, index_t, metadata_storage_t, offset_t - >(); + auto static constexpr const byte_size = + detail::get_node_alignment(); struct inner_data_type { value_type stored_value; @@ -261,6 +245,6 @@ struct alignas( aligned_data_type aligned_data; }; -} -} -} +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/detail/postprocessor.hpp b/cpp/include/cuml/experimental/fil/detail/postprocessor.hpp index c1cd30d8ce..5ea3820f2c 100644 --- a/cpp/include/cuml/experimental/fil/detail/postprocessor.hpp +++ b/cpp/include/cuml/experimental/fil/detail/postprocessor.hpp @@ -17,12 +17,12 @@ #ifndef __CUDACC__ #include #endif -#include -#include -#include #include -#include #include +#include +#include +#include +#include namespace ML { namespace experimental { @@ -31,12 +31,11 @@ namespace fil { /* Convert the postprocessing operations into a single value * representing what must be done in the inference kernel */ - HOST DEVICE inline auto constexpr ops_to_val(row_op row_wise, element_op elem_wise) { - return ( - static_cast>(row_wise) | - static_cast>(elem_wise) - ); - } +HOST DEVICE inline auto constexpr ops_to_val(row_op row_wise, element_op elem_wise) +{ + return (static_cast>(row_wise) | + static_cast>(elem_wise)); +} /* * Perform postprocessing on raw forest output @@ 
-55,204 +54,187 @@ namespace fil { * @param constant If the postprocessing operation requires a constant, * it can be passed here. */ - template< - row_op row_wise_v, - element_op elem_wise_v, - typename io_t - > - HOST DEVICE void postprocess( - io_t* val, - index_type output_count, - io_t* out, - index_type stride=index_type{1}, - io_t average_factor=io_t{1}, - io_t bias=io_t{0}, - io_t constant=io_t{1} - ) { +template +HOST DEVICE void postprocess(io_t* val, + index_type output_count, + io_t* out, + index_type stride = index_type{1}, + io_t average_factor = io_t{1}, + io_t bias = io_t{0}, + io_t constant = io_t{1}) +{ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-but-set-variable" - auto max_index = index_type{}; - auto max_value = std::numeric_limits::lowest(); + auto max_index = index_type{}; + auto max_value = std::numeric_limits::lowest(); #pragma GCC diagnostic pop - for (auto output_index=index_type{}; output_index < output_count; ++output_index) { - auto workspace_index = output_index * stride; - val[workspace_index] = val[workspace_index] / average_factor + bias; - if constexpr (elem_wise_v == element_op::signed_square) { - val[workspace_index] = copysign(val[workspace_index] * val[workspace_index], val[workspace_index]); - } else if constexpr (elem_wise_v == element_op::hinge) { - val[workspace_index] = io_t(val[workspace_index] > io_t{}); - } else if constexpr (elem_wise_v == element_op::sigmoid) { - val[workspace_index] = io_t{1} / (io_t{1} + exp(-constant * val[workspace_index])); - } else if constexpr (elem_wise_v == element_op::exponential) { - val[workspace_index] = exp(val[workspace_index] / constant); - } else if constexpr (elem_wise_v == element_op::logarithm_one_plus_exp) { - val[workspace_index] = log1p(exp(val[workspace_index] / constant)); - } - if constexpr (row_wise_v == row_op::softmax || row_wise_v == row_op::max_index) { - auto is_new_max = val[workspace_index] > max_value; - max_index = is_new_max * output_index + (!is_new_max) * max_index; - max_value = is_new_max * val[workspace_index] + (!is_new_max) * max_value; - } + for (auto output_index = index_type{}; output_index < output_count; ++output_index) { + auto workspace_index = output_index * stride; + val[workspace_index] = val[workspace_index] / average_factor + bias; + if constexpr (elem_wise_v == element_op::signed_square) { + val[workspace_index] = + copysign(val[workspace_index] * val[workspace_index], val[workspace_index]); + } else if constexpr (elem_wise_v == element_op::hinge) { + val[workspace_index] = io_t(val[workspace_index] > io_t{}); + } else if constexpr (elem_wise_v == element_op::sigmoid) { + val[workspace_index] = io_t{1} / (io_t{1} + exp(-constant * val[workspace_index])); + } else if constexpr (elem_wise_v == element_op::exponential) { + val[workspace_index] = exp(val[workspace_index] / constant); + } else if constexpr (elem_wise_v == element_op::logarithm_one_plus_exp) { + val[workspace_index] = log1p(exp(val[workspace_index] / constant)); } + if constexpr (row_wise_v == row_op::softmax || row_wise_v == row_op::max_index) { + auto is_new_max = val[workspace_index] > max_value; + max_index = is_new_max * output_index + (!is_new_max) * max_index; + max_value = is_new_max * val[workspace_index] + (!is_new_max) * max_value; + } + } - if constexpr (row_wise_v == row_op::max_index) { - *out = max_index; - } else { + if constexpr (row_wise_v == row_op::max_index) { + *out = max_index; + } else { #pragma GCC diagnostic push #pragma GCC diagnostic ignored 
"-Wunused-but-set-variable" - auto softmax_normalization = io_t{}; + auto softmax_normalization = io_t{}; #pragma GCC diagnostic pop - if constexpr (row_wise_v == row_op::softmax) { - for (auto workspace_index=index_type{}; workspace_index < output_count * stride; workspace_index += stride) { - val[workspace_index] = exp(val[workspace_index] - max_value); - softmax_normalization += val[workspace_index]; - } + if constexpr (row_wise_v == row_op::softmax) { + for (auto workspace_index = index_type{}; workspace_index < output_count * stride; + workspace_index += stride) { + val[workspace_index] = exp(val[workspace_index] - max_value); + softmax_normalization += val[workspace_index]; } + } - for (auto output_index=index_type{}; output_index < output_count; ++output_index) { - auto workspace_index = output_index * stride; - if constexpr (row_wise_v == row_op::softmax) { - out[output_index] = val[workspace_index] / softmax_normalization; - } else { - out[output_index] = val[workspace_index]; - } + for (auto output_index = index_type{}; output_index < output_count; ++output_index) { + auto workspace_index = output_index * stride; + if constexpr (row_wise_v == row_op::softmax) { + out[output_index] = val[workspace_index] / softmax_normalization; + } else { + out[output_index] = val[workspace_index]; } } } +} - /* - * Struct which holds all data necessary to perform postprocessing on raw - * output of a forest model - * - * @tparam io_t The type used for input and output to/from the model - * (typically float/double) - * @param row_wise Enum value representing the row-wise post-processing - * operation to perform on the output - * @param elem_wise Enum value representing the element-wise post-processing - * operation to perform on the output - * @param average_factor The factor by which to divide during the - * normalization step of postprocessing - * @param bias The bias factor to subtract off during the - * normalization step of postprocessing - * @param constant If the postprocessing operation requires a constant, - * it can be passed here. - */ - template - struct postprocessor { - HOST DEVICE postprocessor( - row_op row_wise=row_op::disable, - element_op elem_wise=element_op::disable, - io_t average_factor=io_t{1}, - io_t bias=io_t{0}, - io_t constant=io_t{1} - ) : - average_factor_{average_factor}, +/* + * Struct which holds all data necessary to perform postprocessing on raw + * output of a forest model + * + * @tparam io_t The type used for input and output to/from the model + * (typically float/double) + * @param row_wise Enum value representing the row-wise post-processing + * operation to perform on the output + * @param elem_wise Enum value representing the element-wise post-processing + * operation to perform on the output + * @param average_factor The factor by which to divide during the + * normalization step of postprocessing + * @param bias The bias factor to subtract off during the + * normalization step of postprocessing + * @param constant If the postprocessing operation requires a constant, + * it can be passed here. 
+ */ +template +struct postprocessor { + HOST DEVICE postprocessor(row_op row_wise = row_op::disable, + element_op elem_wise = element_op::disable, + io_t average_factor = io_t{1}, + io_t bias = io_t{0}, + io_t constant = io_t{1}) + : average_factor_{average_factor}, bias_{bias}, constant_{constant}, row_wise_{row_wise}, - elem_wise_{elem_wise} { - } + elem_wise_{elem_wise} + { + } - HOST DEVICE void operator()(io_t* val, index_type output_count, io_t* out, index_type stride=index_type{1}) const { - switch(ops_to_val(row_wise_, elem_wise_)) { - case ops_to_val(row_op::disable, element_op::signed_square): - postprocess( - val, output_count, out, stride, average_factor_, bias_, constant_ - ); - break; - case ops_to_val(row_op::disable, element_op::hinge): - postprocess( - val, output_count, out, stride, average_factor_, bias_, constant_ - ); - break; - case ops_to_val(row_op::disable, element_op::sigmoid): - postprocess( - val, output_count, out, stride, average_factor_, bias_, constant_ - ); - break; - case ops_to_val(row_op::disable, element_op::exponential): - postprocess( - val, output_count, out, stride, average_factor_, bias_, constant_ - ); - break; - case ops_to_val(row_op::disable, element_op::logarithm_one_plus_exp): - postprocess( - val, output_count, out, stride, average_factor_, bias_, constant_ - ); - break; - case ops_to_val(row_op::softmax, element_op::disable): - postprocess( - val, output_count, out, stride, average_factor_, bias_, constant_ - ); - break; - case ops_to_val(row_op::softmax, element_op::signed_square): - postprocess( - val, output_count, out, stride, average_factor_, bias_, constant_ - ); - break; - case ops_to_val(row_op::softmax, element_op::hinge): - postprocess( - val, output_count, out, stride, average_factor_, bias_, constant_ - ); - break; - case ops_to_val(row_op::softmax, element_op::sigmoid): - postprocess( - val, output_count, out, stride, average_factor_, bias_, constant_ - ); - break; - case ops_to_val(row_op::softmax, element_op::exponential): - postprocess( - val, output_count, out, stride, average_factor_, bias_, constant_ - ); - break; - case ops_to_val(row_op::softmax, element_op::logarithm_one_plus_exp): - postprocess( - val, output_count, out, stride, average_factor_, bias_, constant_ - ); - break; - case ops_to_val(row_op::max_index, element_op::disable): - postprocess( - val, output_count, out, stride, average_factor_, bias_, constant_ - ); - break; - case ops_to_val(row_op::max_index, element_op::signed_square): - postprocess( - val, output_count, out, stride, average_factor_, bias_, constant_ - ); - break; - case ops_to_val(row_op::max_index, element_op::hinge): - postprocess( - val, output_count, out, stride, average_factor_, bias_, constant_ - ); - break; - case ops_to_val(row_op::max_index, element_op::sigmoid): - postprocess( - val, output_count, out, stride, average_factor_, bias_, constant_ - ); - break; - case ops_to_val(row_op::max_index, element_op::exponential): - postprocess( - val, output_count, out, stride, average_factor_, bias_, constant_ - ); - break; - case ops_to_val(row_op::max_index, element_op::logarithm_one_plus_exp): - postprocess( - val, output_count, out, stride, average_factor_, bias_, constant_ - ); - break; - default: - postprocess( - val, output_count, out, stride, average_factor_, bias_, constant_ - ); - } + HOST DEVICE void operator()(io_t* val, + index_type output_count, + io_t* out, + index_type stride = index_type{1}) const + { + switch (ops_to_val(row_wise_, elem_wise_)) { + case 
ops_to_val(row_op::disable, element_op::signed_square): + postprocess( + val, output_count, out, stride, average_factor_, bias_, constant_); + break; + case ops_to_val(row_op::disable, element_op::hinge): + postprocess( + val, output_count, out, stride, average_factor_, bias_, constant_); + break; + case ops_to_val(row_op::disable, element_op::sigmoid): + postprocess( + val, output_count, out, stride, average_factor_, bias_, constant_); + break; + case ops_to_val(row_op::disable, element_op::exponential): + postprocess( + val, output_count, out, stride, average_factor_, bias_, constant_); + break; + case ops_to_val(row_op::disable, element_op::logarithm_one_plus_exp): + postprocess( + val, output_count, out, stride, average_factor_, bias_, constant_); + break; + case ops_to_val(row_op::softmax, element_op::disable): + postprocess( + val, output_count, out, stride, average_factor_, bias_, constant_); + break; + case ops_to_val(row_op::softmax, element_op::signed_square): + postprocess( + val, output_count, out, stride, average_factor_, bias_, constant_); + break; + case ops_to_val(row_op::softmax, element_op::hinge): + postprocess( + val, output_count, out, stride, average_factor_, bias_, constant_); + break; + case ops_to_val(row_op::softmax, element_op::sigmoid): + postprocess( + val, output_count, out, stride, average_factor_, bias_, constant_); + break; + case ops_to_val(row_op::softmax, element_op::exponential): + postprocess( + val, output_count, out, stride, average_factor_, bias_, constant_); + break; + case ops_to_val(row_op::softmax, element_op::logarithm_one_plus_exp): + postprocess( + val, output_count, out, stride, average_factor_, bias_, constant_); + break; + case ops_to_val(row_op::max_index, element_op::disable): + postprocess( + val, output_count, out, stride, average_factor_, bias_, constant_); + break; + case ops_to_val(row_op::max_index, element_op::signed_square): + postprocess( + val, output_count, out, stride, average_factor_, bias_, constant_); + break; + case ops_to_val(row_op::max_index, element_op::hinge): + postprocess( + val, output_count, out, stride, average_factor_, bias_, constant_); + break; + case ops_to_val(row_op::max_index, element_op::sigmoid): + postprocess( + val, output_count, out, stride, average_factor_, bias_, constant_); + break; + case ops_to_val(row_op::max_index, element_op::exponential): + postprocess( + val, output_count, out, stride, average_factor_, bias_, constant_); + break; + case ops_to_val(row_op::max_index, element_op::logarithm_one_plus_exp): + postprocess( + val, output_count, out, stride, average_factor_, bias_, constant_); + break; + default: + postprocess( + val, output_count, out, stride, average_factor_, bias_, constant_); } - private: - io_t average_factor_; - io_t bias_; - io_t constant_; - row_op row_wise_; - element_op elem_wise_; - }; -} -} -} + } + + private: + io_t average_factor_; + io_t bias_; + io_t constant_; + row_op row_wise_; + element_op elem_wise_; +}; +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/buffer.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/buffer.hpp index bd8ffe5fae..1d42ae4600 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/buffer.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/buffer.hpp @@ -15,116 +15,104 @@ */ #pragma once #include -#include -#include -#include -#include -#include +#include #include #include -#include #include #include #include #include #include 
#include +#include +#include +#include +#include +#include namespace raft_proto { /** * @brief A container which may or may not own its own data on host or device * */ -template +template struct buffer { using index_type = std::size_t; using value_type = T; - using data_store = std::variant< - non_owning_buffer, non_owning_buffer, owning_buffer, owning_buffer - >; + using data_store = std::variant, + non_owning_buffer, + owning_buffer, + owning_buffer>; buffer() : device_{}, data_{}, size_{}, cached_ptr{nullptr} {} /** Construct non-initialized owning buffer */ buffer(index_type size, device_type mem_type = device_type::cpu, - int device = 0, - cuda_stream stream = 0) + int device = 0, + cuda_stream stream = 0) : device_{[mem_type, &device]() { - auto result = device_id_variant{}; - switch (mem_type) { - case device_type::cpu: result = device_id{device}; break; - case device_type::gpu: result = device_id{device}; break; - } - return result; - }()}, - data_{[this, mem_type, size, stream]() { - auto result = data_store{}; - switch (mem_type) { - case device_type::cpu: - result = owning_buffer{size}; - break; - case device_type::gpu: - result = owning_buffer{std::get<1>(device_), size, stream}; - break; - } - return result; - }()}, - size_{size}, - cached_ptr {[this](){ - auto result = static_cast(nullptr); - switch(data_.index()) { - case 0: result = std::get<0>(data_).get(); break; - case 1: result = std::get<1>(data_).get(); break; - case 2: result = std::get<2>(data_).get(); break; - case 3: result = std::get<3>(data_).get(); break; - } - return result; - }()} + auto result = device_id_variant{}; + switch (mem_type) { + case device_type::cpu: result = device_id{device}; break; + case device_type::gpu: result = device_id{device}; break; + } + return result; + }()}, + data_{[this, mem_type, size, stream]() { + auto result = data_store{}; + switch (mem_type) { + case device_type::cpu: result = owning_buffer{size}; break; + case device_type::gpu: + result = owning_buffer{std::get<1>(device_), size, stream}; + break; + } + return result; + }()}, + size_{size}, + cached_ptr{[this]() { + auto result = static_cast(nullptr); + switch (data_.index()) { + case 0: result = std::get<0>(data_).get(); break; + case 1: result = std::get<1>(data_).get(); break; + case 2: result = std::get<2>(data_).get(); break; + case 3: result = std::get<3>(data_).get(); break; + } + return result; + }()} { } /** Construct non-owning buffer */ - buffer(T* input_data, - index_type size, - device_type mem_type = device_type::cpu, - int device = 0) + buffer(T* input_data, index_type size, device_type mem_type = device_type::cpu, int device = 0) : device_{[mem_type, &device]() { - auto result = device_id_variant{}; - switch (mem_type) { - case device_type::cpu: - result = device_id{device}; - break; - case device_type::gpu: - result = device_id{device}; - break; - } - return result; - }()}, - data_{[this, input_data, mem_type]() { - auto result = data_store{}; - switch (mem_type) { - case device_type::cpu: - result = non_owning_buffer{input_data}; - break; - case device_type::gpu: - result = non_owning_buffer{input_data}; - break; - } - return result; - }()}, - size_{size}, - cached_ptr {[this](){ - auto result = static_cast(nullptr); - switch(data_.index()) { - case 0: result = std::get<0>(data_).get(); break; - case 1: result = std::get<1>(data_).get(); break; - case 2: result = std::get<2>(data_).get(); break; - case 3: result = std::get<3>(data_).get(); break; - } - return result; - }()} + auto result = 
device_id_variant{}; + switch (mem_type) { + case device_type::cpu: result = device_id{device}; break; + case device_type::gpu: result = device_id{device}; break; + } + return result; + }()}, + data_{[this, input_data, mem_type]() { + auto result = data_store{}; + switch (mem_type) { + case device_type::cpu: result = non_owning_buffer{input_data}; break; + case device_type::gpu: result = non_owning_buffer{input_data}; break; + } + return result; + }()}, + size_{size}, + cached_ptr{[this]() { + auto result = static_cast(nullptr); + switch (data_.index()) { + case 0: result = std::get<0>(data_).get(); break; + case 1: result = std::get<1>(data_).get(); break; + case 2: result = std::get<2>(data_).get(); break; + case 3: result = std::get<3>(data_).get(); break; + } + return result; + }()} { } @@ -134,45 +122,44 @@ struct buffer { * A buffer constructed in this way is owning and will copy the data from * the original location */ - buffer(buffer const& other, device_type mem_type, int device = 0, cuda_stream stream=cuda_stream{}) + buffer(buffer const& other, + device_type mem_type, + int device = 0, + cuda_stream stream = cuda_stream{}) : device_{[mem_type, &device]() { - auto result = device_id_variant{}; - switch (mem_type) { - case device_type::cpu: - result = device_id{device}; - break; - case device_type::gpu: - result = device_id{device}; - break; - } - return result; - }()}, - data_{[this, &other, mem_type, device, stream]() { - auto result = data_store{}; - auto result_data = static_cast(nullptr); - if (mem_type == device_type::cpu) { - auto buf = owning_buffer(other.size()); - result_data = buf.get(); - result = std::move(buf); - } else if (mem_type==device_type::gpu) { - auto buf = owning_buffer(std::get<1>(device_), other.size(), stream); - result_data = buf.get(); - result = std::move(buf); - } - copy(result_data, other.data(), other.size(), mem_type, other.memory_type(), stream); - return result; - }()}, - size_{other.size()}, - cached_ptr {[this](){ - auto result = static_cast(nullptr); - switch(data_.index()) { - case 0: result = std::get<0>(data_).get(); break; - case 1: result = std::get<1>(data_).get(); break; - case 2: result = std::get<2>(data_).get(); break; - case 3: result = std::get<3>(data_).get(); break; - } - return result; - }()} + auto result = device_id_variant{}; + switch (mem_type) { + case device_type::cpu: result = device_id{device}; break; + case device_type::gpu: result = device_id{device}; break; + } + return result; + }()}, + data_{[this, &other, mem_type, device, stream]() { + auto result = data_store{}; + auto result_data = static_cast(nullptr); + if (mem_type == device_type::cpu) { + auto buf = owning_buffer(other.size()); + result_data = buf.get(); + result = std::move(buf); + } else if (mem_type == device_type::gpu) { + auto buf = owning_buffer(std::get<1>(device_), other.size(), stream); + result_data = buf.get(); + result = std::move(buf); + } + copy(result_data, other.data(), other.size(), mem_type, other.memory_type(), stream); + return result; + }()}, + size_{other.size()}, + cached_ptr{[this]() { + auto result = static_cast(nullptr); + switch (data_.index()) { + case 0: result = std::get<0>(data_).get(); break; + case 1: result = std::get<1>(data_).get(); break; + case 2: result = std::get<2>(data_).get(); break; + case 3: result = std::get<3>(data_).get(); break; + } + return result; + }()} { } @@ -181,14 +168,16 @@ struct buffer { * The memory type of this new buffer will be the same as the original */ buffer(buffer const& other) : 
buffer(other, other.memory_type(), other.device_index()) {} - friend void swap(buffer& first, buffer& second) { + friend void swap(buffer& first, buffer& second) + { using std::swap; swap(first.device_, second.device_); swap(first.data_, second.data_); swap(first.size_, second.size_); swap(first.cached_ptr, second.cached_ptr); } - buffer& operator=(buffer other) { + buffer& operator=(buffer other) + { swap(*this, other); return *this; } @@ -197,7 +186,10 @@ struct buffer { * @brief Create owning copy of existing buffer with given stream * The memory type of this new buffer will be the same as the original */ - buffer(buffer const& other, cuda_stream stream) : buffer(other, other.memory_type(), other.device_index(), stream) {} + buffer(buffer const& other, cuda_stream stream) + : buffer(other, other.memory_type(), other.device_index(), stream) + { + } /** * @brief Move from existing buffer unless a copy is necessary based on @@ -205,47 +197,43 @@ struct buffer { */ buffer(buffer&& other, device_type mem_type, int device, cuda_stream stream) : device_{[mem_type, &device]() { - auto result = device_id_variant{}; - switch (mem_type) { - case device_type::cpu: - result = device_id{device}; - break; - case device_type::gpu: - result = device_id{device}; - break; - } - return result; - }()}, - data_{[&other, mem_type, device, stream]() { - auto result = data_store{}; - if (mem_type == other.memory_type() && device == other.device_index()) { - result = std::move(other.data_); - } else { - auto* result_data = static_cast(nullptr); - if (mem_type == device_type::cpu) { - auto buf = owning_buffer{other.size()}; - result_data = buf.get(); - result = std::move(buf); - } else if (mem_type == device_type::gpu) { - auto buf = owning_buffer{device, other.size(), stream}; - result_data = buf.get(); - result = std::move(buf); + auto result = device_id_variant{}; + switch (mem_type) { + case device_type::cpu: result = device_id{device}; break; + case device_type::gpu: result = device_id{device}; break; } - copy(result_data, other.data(), other.size(), mem_type, other.memory_type(), stream); - } - return result; - }()}, - size_{other.size()}, - cached_ptr {[this](){ - auto result = static_cast(nullptr); - switch(data_.index()) { - case 0: result = std::get<0>(data_).get(); break; - case 1: result = std::get<1>(data_).get(); break; - case 2: result = std::get<2>(data_).get(); break; - case 3: result = std::get<3>(data_).get(); break; - } - return result; - }()} + return result; + }()}, + data_{[&other, mem_type, device, stream]() { + auto result = data_store{}; + if (mem_type == other.memory_type() && device == other.device_index()) { + result = std::move(other.data_); + } else { + auto* result_data = static_cast(nullptr); + if (mem_type == device_type::cpu) { + auto buf = owning_buffer{other.size()}; + result_data = buf.get(); + result = std::move(buf); + } else if (mem_type == device_type::gpu) { + auto buf = owning_buffer{device, other.size(), stream}; + result_data = buf.get(); + result = std::move(buf); + } + copy(result_data, other.data(), other.size(), mem_type, other.memory_type(), stream); + } + return result; + }()}, + size_{other.size()}, + cached_ptr{[this]() { + auto result = static_cast(nullptr); + switch (data_.index()) { + case 0: result = std::get<0>(data_).get(); break; + case 1: result = std::get<1>(data_).get(); break; + case 2: result = std::get<2>(data_).get(); break; + case 3: result = std::get<3>(data_).get(); break; + } + return result; + }()} { } buffer(buffer&& other, device_type 
mem_type, int device) @@ -257,40 +245,42 @@ struct buffer { { } - buffer(buffer&& other) : buffer{} { - swap(*this, other); - } + buffer(buffer&& other) : buffer{} { swap(*this, other); } template < typename iter_t, - typename = decltype(*std::declval(), void(), ++std::declval(), void()) - > + typename = decltype(*std::declval(), void(), ++std::declval(), void())> buffer(iter_t const& begin, iter_t const& end) : buffer{static_cast(std::distance(begin, end))} { auto index = std::size_t{}; - std::for_each(begin, end, [&index, this](auto&& val) { - data()[index++] = val; - }); + std::for_each(begin, end, [&index, this](auto&& val) { data()[index++] = val; }); } template < typename iter_t, - typename = decltype(*std::declval(), void(), ++std::declval(), void()) - > - buffer(iter_t const& begin, iter_t const& end, device_type mem_type) : buffer{buffer{begin, end}, mem_type} { } + typename = decltype(*std::declval(), void(), ++std::declval(), void())> + buffer(iter_t const& begin, iter_t const& end, device_type mem_type) + : buffer{buffer{begin, end}, mem_type} + { + } template < typename iter_t, - typename = decltype(*std::declval(), void(), ++std::declval(), void()) - > - buffer(iter_t const& begin, iter_t const& end, device_type mem_type, int device, cuda_stream stream=cuda_stream{}) : buffer{buffer{begin, end}, mem_type, device, stream} { } + typename = decltype(*std::declval(), void(), ++std::declval(), void())> + buffer(iter_t const& begin, + iter_t const& end, + device_type mem_type, + int device, + cuda_stream stream = cuda_stream{}) + : buffer{buffer{begin, end}, mem_type, device, stream} + { + } auto size() const noexcept { return size_; } - HOST DEVICE auto* data() const noexcept { - return cached_ptr; - } - auto memory_type() const noexcept { + HOST DEVICE auto* data() const noexcept { return cached_ptr; } + auto memory_type() const noexcept + { auto result = device_type{}; if (device_.index() == 0) { result = device_type::cpu; @@ -300,13 +290,12 @@ struct buffer { return result; } - auto device() const noexcept { - return device_; - } + auto device() const noexcept { return device_; } - auto device_index() const noexcept { + auto device_index() const noexcept + { auto result = int{}; - switch(device_.index()) { + switch (device_.index()) { case 0: result = std::get<0>(device_).value(); break; case 1: result = std::get<1>(device_).value(); break; } @@ -321,46 +310,76 @@ struct buffer { T* cached_ptr; }; -template -const_agnostic_same_t copy(buffer& dst, buffer const& src, typename buffer::index_type dst_offset, typename buffer::index_type src_offset, typename buffer::index_type size, cuda_stream stream) { +template +const_agnostic_same_t copy(buffer& dst, + buffer const& src, + typename buffer::index_type dst_offset, + typename buffer::index_type src_offset, + typename buffer::index_type size, + cuda_stream stream) +{ if constexpr (bounds_check) { if (src.size() - src_offset < size || dst.size() - dst_offset < size) { throw out_of_bounds("Attempted copy to or from buffer of inadequate size"); } } - copy(dst.data() + dst_offset, src.data() + src_offset, size, dst.memory_type(), src.memory_type(), stream); + copy(dst.data() + dst_offset, + src.data() + src_offset, + size, + dst.memory_type(), + src.memory_type(), + stream); } -template -const_agnostic_same_t copy(buffer& dst, buffer const& src, cuda_stream stream) { +template +const_agnostic_same_t copy(buffer& dst, buffer const& src, cuda_stream stream) +{ copy(dst, src, 0, 0, src.size(), stream); } -template -const_agnostic_same_t 
copy(buffer& dst, buffer const& src) { +template +const_agnostic_same_t copy(buffer& dst, buffer const& src) +{ copy(dst, src, 0, 0, src.size(), cuda_stream{}); } -template -const_agnostic_same_t copy(buffer&& dst, buffer&& src, typename buffer::index_type dst_offset, typename buffer::index_type src_offset, typename buffer::index_type size, cuda_stream stream) { +template +const_agnostic_same_t copy(buffer&& dst, + buffer&& src, + typename buffer::index_type dst_offset, + typename buffer::index_type src_offset, + typename buffer::index_type size, + cuda_stream stream) +{ if constexpr (bounds_check) { if (src.size() - src_offset < size || dst.size() - dst_offset < size) { throw out_of_bounds("Attempted copy to or from buffer of inadequate size"); } } - copy(dst.data() + dst_offset, src.data() + src_offset, size, dst.memory_type(), src.memory_type(), stream); + copy(dst.data() + dst_offset, + src.data() + src_offset, + size, + dst.memory_type(), + src.memory_type(), + stream); } -template -const_agnostic_same_t copy(buffer&& dst, buffer&& src, typename buffer::index_type dst_offset, cuda_stream stream) { +template +const_agnostic_same_t copy(buffer&& dst, + buffer&& src, + typename buffer::index_type dst_offset, + cuda_stream stream) +{ copy(dst, src, dst_offset, 0, src.size(), stream); } -template -const_agnostic_same_t copy(buffer&& dst, buffer&& src, cuda_stream stream) { +template +const_agnostic_same_t copy(buffer&& dst, buffer&& src, cuda_stream stream) +{ copy(dst, src, 0, 0, src.size(), stream); } -template -const_agnostic_same_t copy(buffer&& dst, buffer&& src) { +template +const_agnostic_same_t copy(buffer&& dst, buffer&& src) +{ copy(dst, src, 0, 0, src.size(), cuda_stream{}); } diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/ceildiv.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/ceildiv.hpp index fbea3d7b2c..33c68243ba 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/ceildiv.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/ceildiv.hpp @@ -18,7 +18,8 @@ namespace raft_proto { template -HOST DEVICE auto constexpr ceildiv(T dividend, U divisor) { +HOST DEVICE auto constexpr ceildiv(T dividend, U divisor) +{ return (dividend + divisor - T{1}) / divisor; } -} +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/cuda_check.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/cuda_check.hpp index bcc33c9e9e..06a44c540d 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/cuda_check.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/cuda_check.hpp @@ -23,7 +23,8 @@ namespace raft_proto { template -void cuda_check(error_t const& err) noexcept(!GPU_ENABLED) { +void cuda_check(error_t const& err) noexcept(!GPU_ENABLED) +{ detail::cuda_check(err); } -} +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/cuda_stream.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/cuda_stream.hpp index 07b8d86c40..ca61251b34 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/cuda_stream.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/cuda_stream.hpp @@ -24,9 +24,10 @@ using cuda_stream = cudaStream_t; #else using cuda_stream = int; #endif -inline void synchronize(cuda_stream stream) { +inline void synchronize(cuda_stream stream) +{ #ifdef CUML_ENABLE_GPU cudaStreamSynchronize(stream); #endif } -} +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/const_agnostic.hpp 
b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/const_agnostic.hpp index cf2859b5df..d12e8c3f9c 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/const_agnostic.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/const_agnostic.hpp @@ -24,4 +24,4 @@ using const_agnostic_same_t = template inline constexpr auto const_agnostic_same_v = std::is_same_v, std::remove_const_t>; -} +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/copy.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/copy.hpp index d7a9144798..04a530e341 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/copy.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/copy.hpp @@ -14,56 +14,80 @@ * limitations under the License. */ #pragma once -#include #include #include +#include #ifdef CUML_ENABLE_GPU #include #endif #include namespace raft_proto { -template -void copy(T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset) { +template +void copy(T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset) +{ detail::copy(dst + dst_offset, src + src_offset, size, cuda_stream{}); } -template -void copy(T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset, cuda_stream stream) { +template +void copy( + T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset, cuda_stream stream) +{ detail::copy(dst + dst_offset, src + src_offset, size, stream); } -template -void copy(T* dst, T const* src, uint32_t size) { +template +void copy(T* dst, T const* src, uint32_t size) +{ detail::copy(dst, src, size, cuda_stream{}); } -template -void copy(T* dst, T const* src, uint32_t size, cuda_stream stream) { +template +void copy(T* dst, T const* src, uint32_t size, cuda_stream stream) +{ detail::copy(dst, src, size, stream); } -template -void copy(T* dst, T const* src, uint32_t size, device_type dst_type, device_type src_type, uint32_t dst_offset, uint32_t src_offset, cuda_stream stream) { +template +void copy(T* dst, + T const* src, + uint32_t size, + device_type dst_type, + device_type src_type, + uint32_t dst_offset, + uint32_t src_offset, + cuda_stream stream) +{ if (dst_type == device_type::gpu && src_type == device_type::gpu) { - detail::copy(dst + dst_offset, src + src_offset, size, stream); + detail::copy( + dst + dst_offset, src + src_offset, size, stream); } else if (dst_type == device_type::cpu && src_type == device_type::cpu) { - detail::copy(dst + dst_offset, src + src_offset, size, stream); + detail::copy( + dst + dst_offset, src + src_offset, size, stream); } else if (dst_type == device_type::gpu && src_type == device_type::cpu) { - detail::copy(dst + dst_offset, src + src_offset, size, stream); + detail::copy( + dst + dst_offset, src + src_offset, size, stream); } else if (dst_type == device_type::cpu && src_type == device_type::gpu) { - detail::copy(dst + dst_offset, src + src_offset, size, stream); + detail::copy( + dst + dst_offset, src + src_offset, size, stream); } } -template -void copy(T* dst, T const* src, uint32_t size, device_type dst_type, device_type src_type) { +template +void copy(T* dst, T const* src, uint32_t size, device_type dst_type, device_type src_type) +{ copy(dst, src, size, dst_type, src_type, 0, 0, cuda_stream{}); } -template -void copy(T* dst, T const* src, uint32_t size, device_type dst_type, device_type src_type, cuda_stream stream) { +template +void copy(T* dst, + T const* 
src, + uint32_t size, + device_type dst_type, + device_type src_type, + cuda_stream stream) +{ copy(dst, src, size, dst_type, src_type, 0, 0, stream); } -} +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/copy/cpu.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/copy/cpu.hpp index e49e0b7e4c..1766c044ba 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/copy/cpu.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/copy/cpu.hpp @@ -14,25 +14,35 @@ * limitations under the License. */ #pragma once -#include #include #include #include #include #include +#include namespace raft_proto { namespace detail { -template -std::enable_if_t, std::bool_constant>, void> copy(T* dst, T const* src, uint32_t size, cuda_stream stream) { +template +std::enable_if_t, + std::bool_constant>, + void> +copy(T* dst, T const* src, uint32_t size, cuda_stream stream) +{ std::copy(src, src + size, dst); } -template -std::enable_if_t, std::bool_constant>, std::bool_constant>, void> copy(T* dst, T const* src, uint32_t size, cuda_stream stream) { +template +std::enable_if_t< + std::conjunction_v, + std::bool_constant>, + std::bool_constant>, + void> +copy(T* dst, T const* src, uint32_t size, cuda_stream stream) +{ throw gpu_unsupported("Copying from or to device in non-GPU build"); } -} -} +} // namespace detail +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/copy/gpu.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/copy/gpu.hpp index 56ea1d0f6a..eeeecca77f 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/copy/gpu.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/copy/gpu.hpp @@ -15,19 +15,25 @@ */ #pragma once #include -#include #include #include #include +#include #include namespace raft_proto { namespace detail { -template -std::enable_if_t, std::bool_constant>, std::bool_constant>, void> copy(T* dst, T const* src, uint32_t size, cuda_stream stream) { +template +std::enable_if_t< + std::conjunction_v, + std::bool_constant>, + std::bool_constant>, + void> +copy(T* dst, T const* src, uint32_t size, cuda_stream stream) +{ raft_proto::cuda_check(cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDefault, stream)); } -} -} +} // namespace detail +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/cuda_check/base.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/cuda_check/base.hpp index a85e07d8e7..37129fe94e 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/cuda_check/base.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/cuda_check/base.hpp @@ -20,8 +20,9 @@ namespace raft_proto { namespace detail { template -void cuda_check(error_t const& err) { +void cuda_check(error_t const& err) +{ } -} -} +} // namespace detail +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/cuda_check/gpu.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/cuda_check/gpu.hpp index b5a5c0a77d..66b19d7e20 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/cuda_check/gpu.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/cuda_check/gpu.hpp @@ -22,12 +22,13 @@ namespace raft_proto { namespace detail { template <> -inline void cuda_check(cudaError_t const& err) noexcept(false) { +inline void cuda_check(cudaError_t const& err) noexcept(false) 
+{ if (err != cudaSuccess) { cudaGetLastError(); throw bad_cuda_call(cudaGetErrorString(err)); } } -} -} +} // namespace detail +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/device_id/base.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/device_id/base.hpp index e66dac1cdc..03699f2349 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/device_id/base.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/device_id/base.hpp @@ -18,12 +18,12 @@ namespace raft_proto { namespace detail { -template +template struct device_id { using value_type = int; device_id(value_type device_index) {} auto value() const { return value_type{}; } }; -} -} +} // namespace detail +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/device_id/cpu.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/device_id/cpu.hpp index 22f2ea7c10..d3f2d944a9 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/device_id/cpu.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/device_id/cpu.hpp @@ -26,8 +26,9 @@ struct device_id { device_id(value_type dev_id) : id_{dev_id} {}; auto value() const noexcept { return id_; } + private: value_type id_; }; -} -} +} // namespace detail +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/device_id/gpu.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/device_id/gpu.hpp index c3c1c82ba5..760eb5f71b 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/device_id/gpu.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/device_id/gpu.hpp @@ -21,19 +21,21 @@ namespace raft_proto { namespace detail { -template<> +template <> struct device_id { using value_type = typename rmm::cuda_device_id::value_type; - device_id() noexcept(false) : id_{[](){ - auto raw_id = value_type{}; - raft_proto::cuda_check(cudaGetDevice(&raw_id)); - return raw_id; - }()} {}; + device_id() noexcept(false) + : id_{[]() { + auto raw_id = value_type{}; + raft_proto::cuda_check(cudaGetDevice(&raw_id)); + return raw_id; + }()} {}; device_id(value_type dev_id) noexcept : id_{dev_id} {}; auto value() const noexcept { return id_.value(); } + private: rmm::cuda_device_id id_; }; -} -} +} // namespace detail +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/device_setter/base.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/device_setter/base.hpp index 013b9464ca..82303e1db0 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/device_setter/base.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/device_setter/base.hpp @@ -14,8 +14,8 @@ * limitations under the License. 
*/ #pragma once -#include #include +#include namespace raft_proto { namespace detail { @@ -26,5 +26,5 @@ struct device_setter { device_setter(device_id device) {} }; -} -} +} // namespace detail +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/device_setter/gpu.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/device_setter/gpu.hpp index ef4e1dda43..71366c55e0 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/device_setter/gpu.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/device_setter/gpu.hpp @@ -17,8 +17,8 @@ #include #include #include -#include #include +#include #include namespace raft_proto { @@ -27,20 +27,21 @@ namespace detail { /** Struct for setting current device within a code block */ template <> struct device_setter { - device_setter(raft_proto::device_id device) noexcept(false) : prev_device_{[]() { - auto result = int{}; - raft_proto::cuda_check(cudaGetDevice(&result)); - return result; - }()} { + device_setter(raft_proto::device_id device) noexcept(false) + : prev_device_{[]() { + auto result = int{}; + raft_proto::cuda_check(cudaGetDevice(&result)); + return result; + }()} + { raft_proto::cuda_check(cudaSetDevice(device.value())); } - ~device_setter() { - RAFT_CUDA_TRY_NO_THROW(cudaSetDevice(prev_device_.value())); - } + ~device_setter() { RAFT_CUDA_TRY_NO_THROW(cudaSetDevice(prev_device_.value())); } + private: device_id prev_device_; }; -} -} +} // namespace detail +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/host_only_throw.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/host_only_throw.hpp index 1ab15770b9..a1db8e3f09 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/host_only_throw.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/host_only_throw.hpp @@ -19,6 +19,6 @@ #include namespace raft_proto { -template +template using host_only_throw = detail::host_only_throw; } diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/host_only_throw/base.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/host_only_throw/base.hpp index 4796339102..348458647a 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/host_only_throw/base.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/host_only_throw/base.hpp @@ -18,12 +18,13 @@ namespace raft_proto { namespace detail { -template +template struct host_only_throw { template - host_only_throw(Args&&... args) { + host_only_throw(Args&&... args) + { static_assert(host); // Do not allow constexpr branch to compile if !host } }; -} -} +} // namespace detail +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/host_only_throw/cpu.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/host_only_throw/cpu.hpp index cab34f8c2d..6f0d121173 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/host_only_throw/cpu.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/host_only_throw/cpu.hpp @@ -19,12 +19,13 @@ namespace raft_proto { namespace detail { -template -struct host_only_throw{ +template +struct host_only_throw { template - host_only_throw(Args&&... args) noexcept(false) { + host_only_throw(Args&&... 
args) noexcept(false) + { throw T{std::forward(args)...}; } }; -} -} +} // namespace detail +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/non_owning_buffer.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/non_owning_buffer.hpp index 9d3cf99273..fc42f6f606 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/non_owning_buffer.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/non_owning_buffer.hpp @@ -14,10 +14,10 @@ * limitations under the License. */ #pragma once -#include #include +#include namespace raft_proto { -template +template using non_owning_buffer = detail::non_owning_buffer; } diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/non_owning_buffer/base.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/non_owning_buffer/base.hpp index 141dc2faac..67e1a92699 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/non_owning_buffer/base.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/non_owning_buffer/base.hpp @@ -20,13 +20,13 @@ namespace raft_proto { namespace detail { -template +template struct non_owning_buffer { // TODO(wphicks): Assess need for buffers of const T using value_type = std::remove_const_t; - non_owning_buffer() : data_{nullptr} { } + non_owning_buffer() : data_{nullptr} {} - non_owning_buffer(T* ptr) : data_{ptr} { } + non_owning_buffer(T* ptr) : data_{ptr} {} auto* get() const { return data_; } @@ -34,6 +34,5 @@ struct non_owning_buffer { // TODO(wphicks): Back this with RMM-allocated host memory T* data_; }; -} -} - +} // namespace detail +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/owning_buffer.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/owning_buffer.hpp index 2ce82081e6..cdd33a8c06 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/owning_buffer.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/owning_buffer.hpp @@ -14,12 +14,12 @@ * limitations under the License. */ #pragma once -#include #include +#include #ifdef CUML_ENABLE_GPU #include #endif namespace raft_proto { -template +template using owning_buffer = detail::owning_buffer; } diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/owning_buffer/base.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/owning_buffer/base.hpp index 984141b8de..b5c377f2ec 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/owning_buffer/base.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/owning_buffer/base.hpp @@ -22,12 +22,12 @@ namespace raft_proto { namespace detail { -template +template struct owning_buffer { owning_buffer() {} owning_buffer(device_id device_id, std::size_t size, cuda_stream stream) {} auto* get() const { return static_cast(nullptr); } }; -} -} +} // namespace detail +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/owning_buffer/cpu.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/owning_buffer/cpu.hpp index 8dea0603b3..056c790def 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/owning_buffer/cpu.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/owning_buffer/cpu.hpp @@ -14,28 +14,22 @@ * limitations under the License. 
*/ #pragma once +#include #include #include -#include #include #include namespace raft_proto { namespace detail { -template +template struct owning_buffer { // TODO(wphicks): Assess need for buffers of const T using value_type = std::remove_const_t; - owning_buffer() - : data_{std::unique_ptr{nullptr}} - { - } + owning_buffer() : data_{std::unique_ptr{nullptr}} {} - owning_buffer(std::size_t size) - : data_{std::make_unique(size)} - { - } + owning_buffer(std::size_t size) : data_{std::make_unique(size)} {} auto* get() const { return data_.get(); } @@ -43,6 +37,5 @@ struct owning_buffer { // TODO(wphicks): Back this with RMM-allocated host memory std::unique_ptr data_; }; -} -} - +} // namespace detail +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/owning_buffer/gpu.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/owning_buffer/gpu.hpp index 9841bc47c1..80bf7e46b6 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/owning_buffer/gpu.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/detail/owning_buffer/gpu.hpp @@ -15,26 +15,28 @@ */ #pragma once #include +#include #include -#include #include -#include +#include #include #include namespace raft_proto { namespace detail { -template +template struct owning_buffer { // TODO(wphicks): Assess need for buffers of const T using value_type = std::remove_const_t; owning_buffer() : data_{} {} - owning_buffer(device_id device_id, std::size_t size, cudaStream_t stream) noexcept(false) + owning_buffer(device_id device_id, + std::size_t size, + cudaStream_t stream) noexcept(false) : data_{[&device_id, &size, &stream]() { - auto device_context = device_setter{device_id}; - return rmm::device_buffer{size * sizeof(value_type), rmm::cuda_stream_view{stream}}; - }()} + auto device_context = device_setter{device_id}; + return rmm::device_buffer{size * sizeof(value_type), rmm::cuda_stream_view{stream}}; + }()} { } @@ -43,5 +45,5 @@ struct owning_buffer { private: mutable rmm::device_buffer data_; }; -} -} +} // namespace detail +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/device_id.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/device_id.hpp index f021ad6a95..c74ad239a4 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/device_id.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/device_id.hpp @@ -28,4 +28,4 @@ template using device_id = detail::device_id; using device_id_variant = std::variant, device_id>; -} +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/device_type.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/device_type.hpp index 24ed9a629b..b30ad6b1bf 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/device_type.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/device_type.hpp @@ -15,8 +15,5 @@ */ #pragma once namespace raft_proto { -enum class device_type { - cpu, - gpu -}; +enum class device_type { cpu, gpu }; } diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/exceptions.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/exceptions.hpp index 0b3128c2b2..54415bb7ea 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/exceptions.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/exceptions.hpp @@ -36,9 +36,9 @@ struct out_of_bounds : std::exception { }; struct wrong_device_type : std::exception { - wrong_device_type() : wrong_device_type( - 
"Attempted to use host data on GPU or device data on CPU" - ) {} + wrong_device_type() : wrong_device_type("Attempted to use host data on GPU or device data on CPU") + { + } wrong_device_type(char const* msg) : msg_{msg} {} virtual char const* what() const noexcept { return msg_; } @@ -47,9 +47,7 @@ struct wrong_device_type : std::exception { }; struct mem_type_mismatch : std::exception { - mem_type_mismatch() : mem_type_mismatch( - "Memory type does not match expected type" - ) {} + mem_type_mismatch() : mem_type_mismatch("Memory type does not match expected type") {} mem_type_mismatch(char const* msg) : msg_{msg} {} virtual char const* what() const noexcept { return msg_; } @@ -58,9 +56,7 @@ struct mem_type_mismatch : std::exception { }; struct wrong_device : std::exception { - wrong_device() : wrong_device( - "Attempted to use incorrect device" - ) {} + wrong_device() : wrong_device("Attempted to use incorrect device") {} wrong_device(char const* msg) : msg_{msg} {} virtual char const* what() const noexcept { return msg_; } @@ -68,4 +64,4 @@ struct wrong_device : std::exception { char const* msg_; }; -} +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/gpu_support.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/gpu_support.hpp index 66df240aff..159d1fa80e 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/gpu_support.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/gpu_support.hpp @@ -14,19 +14,19 @@ * limitations under the License. */ #pragma once -#include #include +#include #include namespace raft_proto { #ifdef CUML_ENABLE_GPU auto constexpr static const GPU_ENABLED = true; #else -auto constexpr static const GPU_ENABLED = false; +auto constexpr static const GPU_ENABLED = false; #endif #ifdef __CUDACC__ -#define HOST __host__ +#define HOST __host__ #define DEVICE __device__ auto constexpr static const GPU_COMPILATION = true; #else @@ -38,7 +38,7 @@ auto constexpr static const GPU_COMPILATION = false; #ifndef DEBUG auto constexpr static const DEBUG_ENABLED = false; #elif DEBUG == 0 -auto constexpr static const DEBUG_ENABLED = false; +auto constexpr static const DEBUG_ENABLED = false; #else auto constexpr static const DEBUG_ENABLED = true; #endif @@ -52,4 +52,4 @@ struct gpu_unsupported : std::exception { char const* msg_; }; -} +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/handle.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/handle.hpp index bae0b4a695..f0fe9bd81f 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/handle.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/handle.hpp @@ -24,39 +24,30 @@ namespace raft_proto { #ifdef CUML_ENABLE_GPU struct handle_t { - handle_t(raft::handle_t const* handle_ptr=nullptr) : raft_handle_{handle_ptr} {} + handle_t(raft::handle_t const* handle_ptr = nullptr) : raft_handle_{handle_ptr} {} handle_t(raft::handle_t const& raft_handle) : raft_handle_{&raft_handle} {} - auto get_next_usable_stream() const { + auto get_next_usable_stream() const + { return raft_proto::cuda_stream{raft_handle_->get_next_usable_stream().value()}; } - auto get_stream_pool_size() const { - return raft_handle_->get_stream_pool_size(); - } - auto get_usable_stream_count() const { - return std::max(get_stream_pool_size(), std::size_t{1}); - } - void synchronize() const { + auto get_stream_pool_size() const { return raft_handle_->get_stream_pool_size(); } + auto get_usable_stream_count() const { return 
std::max(get_stream_pool_size(), std::size_t{1}); } + void synchronize() const + { raft_handle_->sync_stream_pool(); raft_handle_->sync_stream(); } + private: // Have to store a pointer because handle is not movable raft::handle_t const* raft_handle_; }; #else struct handle_t { - auto get_next_usable_stream() const { - return raft_proto::cuda_stream{}; - } - auto get_stream_pool_size() const { - return std::size_t{}; - } - auto get_usable_stream_count() const { - return std::max(get_stream_pool_size(), std::size_t{1}); - } - void synchronize() const { - } + auto get_next_usable_stream() const { return raft_proto::cuda_stream{}; } + auto get_stream_pool_size() const { return std::size_t{}; } + auto get_usable_stream_count() const { return std::max(get_stream_pool_size(), std::size_t{1}); } + void synchronize() const {} }; #endif -} - +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/raft_proto/padding.hpp b/cpp/include/cuml/experimental/fil/detail/raft_proto/padding.hpp index 4ca1229105..82192c2d75 100644 --- a/cpp/include/cuml/experimental/fil/detail/raft_proto/padding.hpp +++ b/cpp/include/cuml/experimental/fil/detail/raft_proto/padding.hpp @@ -21,11 +21,12 @@ namespace raft_proto { /* Return the value that must be added to val to equal the next multiple of * alignment greater than or equal to val */ template -HOST DEVICE auto padding_size(T val, U alignment) { +HOST DEVICE auto padding_size(T val, U alignment) +{ auto result = val; if (alignment != 0) { auto remainder = val % alignment; - result = alignment - remainder; + result = alignment - remainder; result *= (remainder != 0); } return result; @@ -33,25 +34,26 @@ HOST DEVICE auto padding_size(T val, U alignment) { /* Return the next multiple of alignment >= val */ template -HOST DEVICE auto padded_size(T val, U alignment) { +HOST DEVICE auto padded_size(T val, U alignment) +{ return val + padding_size(val, alignment); } /* Return the value that must be added to val to equal the next multiple of * alignment less than or equal to val */ template -HOST DEVICE auto downpadding_size(T val, U alignment) { +HOST DEVICE auto downpadding_size(T val, U alignment) +{ auto result = val; - if (alignment != 0) { - result = val % alignment; - } + if (alignment != 0) { result = val % alignment; } return result; } /* Return the next multiple of alignment <= val */ template -HOST DEVICE auto downpadded_size(T val, U alignment) { +HOST DEVICE auto downpadded_size(T val, U alignment) +{ return val - downpadding_size(val, alignment); } -} +} // namespace raft_proto diff --git a/cpp/include/cuml/experimental/fil/detail/specialization_types.hpp b/cpp/include/cuml/experimental/fil/detail/specialization_types.hpp index f4f3f0a999..d4f0826e73 100644 --- a/cpp/include/cuml/experimental/fil/detail/specialization_types.hpp +++ b/cpp/include/cuml/experimental/fil/detail/specialization_types.hpp @@ -16,9 +16,9 @@ #pragma once #include #include +#include #include #include -#include namespace ML { namespace experimental { namespace fil { @@ -42,23 +42,15 @@ template struct specialization_types { /* The node threshold type to be used based on the template parameters */ - using threshold_type = std::conditional_t< - double_precision, double, float - >; + using threshold_type = std::conditional_t; /* The type required for specifying indexes to vector leaf outputs or * non-local categorical data. 
*/ - using index_type = std::conditional_t< - double_precision, std::uint64_t, std::uint32_t - >; + using index_type = std::conditional_t; /* The type used to provide metadata storage for nodes */ - using metadata_type = std::conditional_t< - large_trees, std::uint32_t, std::uint16_t - >; + using metadata_type = std::conditional_t; /* The type used to provide metadata storage for nodes */ - using offset_type = std::conditional_t< - large_trees, std::uint32_t, std::uint16_t - >; + using offset_type = std::conditional_t; /* The tree layout (alias for layout_v)*/ auto static constexpr const layout = layout_v; /* Whether or not this tree requires double precision (alias for @@ -74,18 +66,17 @@ struct specialization_types { /* A variant holding information on all specialization types compiled * in standard cuML FIL */ -using specialization_variant = std::variant< - specialization_types, - specialization_types, - specialization_types, - specialization_types, - specialization_types, - specialization_types, - specialization_types, - specialization_types ->; +using specialization_variant = + std::variant, + specialization_types, + specialization_types, + specialization_types, + specialization_types, + specialization_types, + specialization_types, + specialization_types>; -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/detail/specializations/device_initialization_macros.hpp b/cpp/include/cuml/experimental/fil/detail/specializations/device_initialization_macros.hpp index 275af98a43..e2cbf6e519 100644 --- a/cpp/include/cuml/experimental/fil/detail/specializations/device_initialization_macros.hpp +++ b/cpp/include/cuml/experimental/fil/detail/specializations/device_initialization_macros.hpp @@ -19,7 +19,7 @@ #include /* Declare device initialization function for the types specified by the given * variant index */ -#define CUML_FIL_INITIALIZE_DEVICE(template_type, variant_index) template_type void initialize_device<\ - CUML_FIL_FOREST(variant_index),\ - raft_proto::device_type::gpu\ ->(raft_proto::device_id); +#define CUML_FIL_INITIALIZE_DEVICE(template_type, variant_index) \ + template_type void \ + initialize_device( \ + raft_proto::device_id); diff --git a/cpp/include/cuml/experimental/fil/detail/specializations/forest_macros.hpp b/cpp/include/cuml/experimental/fil/detail/specializations/forest_macros.hpp index 6919e0a935..561b5beb70 100644 --- a/cpp/include/cuml/experimental/fil/detail/specializations/forest_macros.hpp +++ b/cpp/include/cuml/experimental/fil/detail/specializations/forest_macros.hpp @@ -14,18 +14,24 @@ * limitations under the License. */ #pragma once -#include #include -#include #include +#include #include +#include /* Macro which, given a variant index, will extract the type of the * corresponding variant from the specialization_variant type. This allows us * to specify all forest variants we wish to support in one location and then * reference them by index elsewhere. */ -#define CUML_FIL_SPEC(variant_index) std::variant_alternative_t +#define CUML_FIL_SPEC(variant_index) \ + std::variant_alternative_t /* Macro which expands to a full declaration of a forest type corresponding to * the given variant index. 
*/ -#define CUML_FIL_FOREST(variant_index) forest< CUML_FIL_SPEC(variant_index)::layout, typename CUML_FIL_SPEC(variant_index)::threshold_type, typename CUML_FIL_SPEC(variant_index)::index_type, typename CUML_FIL_SPEC(variant_index)::metadata_type, typename CUML_FIL_SPEC(variant_index)::offset_type> +#define CUML_FIL_FOREST(variant_index) \ + forest diff --git a/cpp/include/cuml/experimental/fil/detail/specializations/infer_macros.hpp b/cpp/include/cuml/experimental/fil/detail/specializations/infer_macros.hpp index 3e618cb051..376e5d0d20 100644 --- a/cpp/include/cuml/experimental/fil/detail/specializations/infer_macros.hpp +++ b/cpp/include/cuml/experimental/fil/detail/specializations/infer_macros.hpp @@ -15,130 +15,139 @@ */ #pragma once #include -#include #include -#include #include #include #include -#include -#include #include #include #include +#include +#include +#include +#include /* Macro which expands to the valid arguments to an inference call for a forest * model without vector leaves or non-local categorical data.*/ -#define CUML_FIL_SCALAR_LOCAL_ARGS(dev, variant_index)(\ - CUML_FIL_FOREST(variant_index) const&,\ - postprocessor const&,\ - CUML_FIL_SPEC(variant_index)::threshold_type*,\ - CUML_FIL_SPEC(variant_index)::threshold_type*,\ - index_type,\ - index_type,\ - index_type,\ - std::nullptr_t,\ - std::nullptr_t,\ - infer_kind,\ - std::optional,\ - raft_proto::device_id,\ - raft_proto::cuda_stream stream\ -) +#define CUML_FIL_SCALAR_LOCAL_ARGS(dev, variant_index) \ + (CUML_FIL_FOREST(variant_index) const&, \ + postprocessor const&, \ + CUML_FIL_SPEC(variant_index)::threshold_type*, \ + CUML_FIL_SPEC(variant_index)::threshold_type*, \ + index_type, \ + index_type, \ + index_type, \ + std::nullptr_t, \ + std::nullptr_t, \ + infer_kind, \ + std::optional, \ + raft_proto::device_id, \ + raft_proto::cuda_stream stream) /* Macro which expands to the valid arguments to an inference call for a forest * model with vector leaves but without non-local categorical data.*/ -#define CUML_FIL_VECTOR_LOCAL_ARGS(dev, variant_index)(\ - CUML_FIL_FOREST(variant_index) const&,\ - postprocessor const&,\ - CUML_FIL_SPEC(variant_index)::threshold_type*,\ - CUML_FIL_SPEC(variant_index)::threshold_type*,\ - index_type,\ - index_type,\ - index_type,\ - CUML_FIL_SPEC(variant_index)::threshold_type*,\ - std::nullptr_t,\ - infer_kind,\ - std::optional,\ - raft_proto::device_id,\ - raft_proto::cuda_stream stream\ -) +#define CUML_FIL_VECTOR_LOCAL_ARGS(dev, variant_index) \ + (CUML_FIL_FOREST(variant_index) const&, \ + postprocessor const&, \ + CUML_FIL_SPEC(variant_index)::threshold_type*, \ + CUML_FIL_SPEC(variant_index)::threshold_type*, \ + index_type, \ + index_type, \ + index_type, \ + CUML_FIL_SPEC(variant_index)::threshold_type*, \ + std::nullptr_t, \ + infer_kind, \ + std::optional, \ + raft_proto::device_id, \ + raft_proto::cuda_stream stream) /* Macro which expands to the valid arguments to an inference call for a forest * model without vector leaves but with non-local categorical data.*/ -#define CUML_FIL_SCALAR_NONLOCAL_ARGS(dev, variant_index)(\ - CUML_FIL_FOREST(variant_index) const&,\ - postprocessor const&,\ - CUML_FIL_SPEC(variant_index)::threshold_type*,\ - CUML_FIL_SPEC(variant_index)::threshold_type*,\ - index_type,\ - index_type,\ - index_type,\ - std::nullptr_t,\ - CUML_FIL_SPEC(variant_index)::index_type*,\ - infer_kind,\ - std::optional,\ - raft_proto::device_id,\ - raft_proto::cuda_stream stream\ -) +#define CUML_FIL_SCALAR_NONLOCAL_ARGS(dev, variant_index) \ + 
(CUML_FIL_FOREST(variant_index) const&, \ + postprocessor const&, \ + CUML_FIL_SPEC(variant_index)::threshold_type*, \ + CUML_FIL_SPEC(variant_index)::threshold_type*, \ + index_type, \ + index_type, \ + index_type, \ + std::nullptr_t, \ + CUML_FIL_SPEC(variant_index)::index_type*, \ + infer_kind, \ + std::optional, \ + raft_proto::device_id, \ + raft_proto::cuda_stream stream) /* Macro which expands to the valid arguments to an inference call for a forest * model with vector leaves and with non-local categorical data.*/ -#define CUML_FIL_VECTOR_NONLOCAL_ARGS(dev, variant_index)(\ - CUML_FIL_FOREST(variant_index) const&,\ - postprocessor const&,\ - CUML_FIL_SPEC(variant_index)::threshold_type*,\ - CUML_FIL_SPEC(variant_index)::threshold_type*,\ - index_type,\ - index_type,\ - index_type,\ - CUML_FIL_SPEC(variant_index)::threshold_type*,\ - CUML_FIL_SPEC(variant_index)::index_type*,\ - infer_kind,\ - std::optional,\ - raft_proto::device_id,\ - raft_proto::cuda_stream stream\ -) +#define CUML_FIL_VECTOR_NONLOCAL_ARGS(dev, variant_index) \ + (CUML_FIL_FOREST(variant_index) const&, \ + postprocessor const&, \ + CUML_FIL_SPEC(variant_index)::threshold_type*, \ + CUML_FIL_SPEC(variant_index)::threshold_type*, \ + index_type, \ + index_type, \ + index_type, \ + CUML_FIL_SPEC(variant_index)::threshold_type*, \ + CUML_FIL_SPEC(variant_index)::index_type*, \ + infer_kind, \ + std::optional, \ + raft_proto::device_id, \ + raft_proto::cuda_stream stream) /* Macro which expands to the declaration of an inference template for a forest * of the type indicated by the variant index */ -#define CUML_FIL_INFER_TEMPLATE(template_type, dev, variant_index, categorical) template_type void infer<\ - dev, categorical, CUML_FIL_FOREST(variant_index)> +#define CUML_FIL_INFER_TEMPLATE(template_type, dev, variant_index, categorical) \ + template_type void infer /* Macro which expands to the declaration of an inference template for a forest * of the type indicated by the variant index on the given device type without * vector leaves or categorical nodes*/ -#define CUML_FIL_INFER_DEV_SCALAR_LEAF_NO_CAT(template_type, dev, variant_index) CUML_FIL_INFER_TEMPLATE(template_type, dev, variant_index, false)CUML_FIL_SCALAR_LOCAL_ARGS(dev, variant_index); +#define CUML_FIL_INFER_DEV_SCALAR_LEAF_NO_CAT(template_type, dev, variant_index) \ + CUML_FIL_INFER_TEMPLATE(template_type, dev, variant_index, false) \ + CUML_FIL_SCALAR_LOCAL_ARGS(dev, variant_index); /* Macro which expands to the declaration of an inference template for a forest * of the type indicated by the variant index on the given device type without * vector leaves and with only local categorical nodes*/ -#define CUML_FIL_INFER_DEV_SCALAR_LEAF_LOCAL_CAT(template_type, dev, variant_index) CUML_FIL_INFER_TEMPLATE(template_type, dev, variant_index, true)CUML_FIL_SCALAR_LOCAL_ARGS(dev, variant_index); +#define CUML_FIL_INFER_DEV_SCALAR_LEAF_LOCAL_CAT(template_type, dev, variant_index) \ + CUML_FIL_INFER_TEMPLATE(template_type, dev, variant_index, true) \ + CUML_FIL_SCALAR_LOCAL_ARGS(dev, variant_index); /* Macro which expands to the declaration of an inference template for a forest * of the type indicated by the variant index on the given device type without * vector leaves and with non-local categorical nodes*/ -#define CUML_FIL_INFER_DEV_SCALAR_LEAF_NONLOCAL_CAT(template_type, dev, variant_index) CUML_FIL_INFER_TEMPLATE(template_type, dev, variant_index, true)CUML_FIL_SCALAR_NONLOCAL_ARGS(dev, variant_index); +#define 
CUML_FIL_INFER_DEV_SCALAR_LEAF_NONLOCAL_CAT(template_type, dev, variant_index) \ + CUML_FIL_INFER_TEMPLATE(template_type, dev, variant_index, true) \ + CUML_FIL_SCALAR_NONLOCAL_ARGS(dev, variant_index); /* Macro which expands to the declaration of an inference template for a forest * of the type indicated by the variant index on the given device type with * vector leaves and without categorical nodes*/ -#define CUML_FIL_INFER_DEV_VECTOR_LEAF_NO_CAT(template_type, dev, variant_index) CUML_FIL_INFER_TEMPLATE(template_type, dev, variant_index, false)CUML_FIL_VECTOR_LOCAL_ARGS(dev, variant_index); +#define CUML_FIL_INFER_DEV_VECTOR_LEAF_NO_CAT(template_type, dev, variant_index) \ + CUML_FIL_INFER_TEMPLATE(template_type, dev, variant_index, false) \ + CUML_FIL_VECTOR_LOCAL_ARGS(dev, variant_index); /* Macro which expands to the declaration of an inference template for a forest * of the type indicated by the variant index on the given device type with * vector leaves and with only local categorical nodes*/ -#define CUML_FIL_INFER_DEV_VECTOR_LEAF_LOCAL_CAT(template_type, dev, variant_index) CUML_FIL_INFER_TEMPLATE(template_type, dev, variant_index, true)CUML_FIL_VECTOR_LOCAL_ARGS(dev, variant_index); +#define CUML_FIL_INFER_DEV_VECTOR_LEAF_LOCAL_CAT(template_type, dev, variant_index) \ + CUML_FIL_INFER_TEMPLATE(template_type, dev, variant_index, true) \ + CUML_FIL_VECTOR_LOCAL_ARGS(dev, variant_index); /* Macro which expands to the declaration of an inference template for a forest * of the type indicated by the variant index on the given device type with * vector leaves and with non-local categorical nodes*/ -#define CUML_FIL_INFER_DEV_VECTOR_LEAF_NONLOCAL_CAT(template_type, dev, variant_index) CUML_FIL_INFER_TEMPLATE(template_type, dev, variant_index, true)CUML_FIL_VECTOR_NONLOCAL_ARGS(dev, variant_index); +#define CUML_FIL_INFER_DEV_VECTOR_LEAF_NONLOCAL_CAT(template_type, dev, variant_index) \ + CUML_FIL_INFER_TEMPLATE(template_type, dev, variant_index, true) \ + CUML_FIL_VECTOR_NONLOCAL_ARGS(dev, variant_index); /* Macro which expands to the declaration of all valid inference templates for * the given device on the forest type specified by the given variant index */ -#define CUML_FIL_INFER_ALL(template_type, dev, variant_index) CUML_FIL_INFER_DEV_SCALAR_LEAF_NO_CAT(template_type, dev, variant_index)\ - CUML_FIL_INFER_DEV_SCALAR_LEAF_LOCAL_CAT(template_type, dev, variant_index)\ - CUML_FIL_INFER_DEV_SCALAR_LEAF_NONLOCAL_CAT(template_type, dev, variant_index)\ - CUML_FIL_INFER_DEV_VECTOR_LEAF_NO_CAT(template_type, dev, variant_index)\ - CUML_FIL_INFER_DEV_VECTOR_LEAF_LOCAL_CAT(template_type, dev, variant_index)\ +#define CUML_FIL_INFER_ALL(template_type, dev, variant_index) \ + CUML_FIL_INFER_DEV_SCALAR_LEAF_NO_CAT(template_type, dev, variant_index) \ + CUML_FIL_INFER_DEV_SCALAR_LEAF_LOCAL_CAT(template_type, dev, variant_index) \ + CUML_FIL_INFER_DEV_SCALAR_LEAF_NONLOCAL_CAT(template_type, dev, variant_index) \ + CUML_FIL_INFER_DEV_VECTOR_LEAF_NO_CAT(template_type, dev, variant_index) \ + CUML_FIL_INFER_DEV_VECTOR_LEAF_LOCAL_CAT(template_type, dev, variant_index) \ CUML_FIL_INFER_DEV_VECTOR_LEAF_NONLOCAL_CAT(template_type, dev, variant_index) diff --git a/cpp/include/cuml/experimental/fil/exceptions.hpp b/cpp/include/cuml/experimental/fil/exceptions.hpp index 8984157ae7..d09d5465aa 100644 --- a/cpp/include/cuml/experimental/fil/exceptions.hpp +++ b/cpp/include/cuml/experimental/fil/exceptions.hpp @@ -23,16 +23,11 @@ namespace fil { /** Exception indicating model is incompatible with experimental 
FIL */ struct unusable_model_exception : std::exception { - unusable_model_exception () : msg_{"Model is not compatible with experimental FIL"} - { - } - unusable_model_exception (std::string msg) : msg_{msg} - { - } - unusable_model_exception (char const* msg) : msg_{msg} - { - } + unusable_model_exception() : msg_{"Model is not compatible with experimental FIL"} {} + unusable_model_exception(std::string msg) : msg_{msg} {} + unusable_model_exception(char const* msg) : msg_{msg} {} virtual char const* what() const noexcept { return msg_.c_str(); } + private: std::string msg_; }; @@ -63,6 +58,6 @@ struct type_error : std::exception { char const* msg_; }; -} -} -} +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/forest_model.hpp b/cpp/include/cuml/experimental/fil/forest_model.hpp index 58e9ecf642..47f3969471 100644 --- a/cpp/include/cuml/experimental/fil/forest_model.hpp +++ b/cpp/include/cuml/experimental/fil/forest_model.hpp @@ -15,14 +15,14 @@ */ #pragma once #include -#include -#include #include -#include #include #include #include #include +#include +#include +#include namespace ML { namespace experimental { @@ -36,74 +36,77 @@ namespace fil { */ struct forest_model { /** Wrap a decision_forest in a full forest_model object */ - forest_model( - decision_forest_variant&& forest = decision_forest_variant{} - ) : decision_forest_{forest} {} + forest_model(decision_forest_variant&& forest = decision_forest_variant{}) + : decision_forest_{forest} + { + } /** The number of features per row expected by the model */ - auto num_features() { - return std::visit([](auto&& concrete_forest) { - return concrete_forest.num_features(); - }, decision_forest_); + auto num_features() + { + return std::visit([](auto&& concrete_forest) { return concrete_forest.num_features(); }, + decision_forest_); } /** The number of outputs per row generated by the model */ - auto num_outputs() { - return std::visit([](auto&& concrete_forest) { - return concrete_forest.num_outputs(); - }, decision_forest_); + auto num_outputs() + { + return std::visit([](auto&& concrete_forest) { return concrete_forest.num_outputs(); }, + decision_forest_); } /** The number of trees in the model */ - auto num_trees() { - return std::visit([](auto&& concrete_forest) { - return concrete_forest.num_trees(); - }, decision_forest_); + auto num_trees() + { + return std::visit([](auto&& concrete_forest) { return concrete_forest.num_trees(); }, + decision_forest_); } /** Whether or not leaf nodes use vector outputs */ - auto has_vector_leaves() { - return std::visit([](auto&& concrete_forest) { - return concrete_forest.has_vector_leaves(); - }, decision_forest_); + auto has_vector_leaves() + { + return std::visit([](auto&& concrete_forest) { return concrete_forest.has_vector_leaves(); }, + decision_forest_); } /** The operation used for postprocessing all outputs for a single row */ - auto row_postprocessing() { - return std::visit([](auto&& concrete_forest) { - return concrete_forest.row_postprocessing(); - }, decision_forest_); + auto row_postprocessing() + { + return std::visit([](auto&& concrete_forest) { return concrete_forest.row_postprocessing(); }, + decision_forest_); } /** The operation used for postprocessing each element of the output for a * single row */ - auto elem_postprocessing() { - return std::visit([](auto&& concrete_forest) { - return concrete_forest.elem_postprocessing(); - }, decision_forest_); + auto elem_postprocessing() + { + return std::visit([](auto&& 
concrete_forest) { return concrete_forest.elem_postprocessing(); }, + decision_forest_); } /** The type of memory (device/host) where the model is stored */ - auto memory_type() { - return std::visit([](auto&& concrete_forest) { - return concrete_forest.memory_type(); - }, decision_forest_); + auto memory_type() + { + return std::visit([](auto&& concrete_forest) { return concrete_forest.memory_type(); }, + decision_forest_); } /** The ID of the device on which this model is loaded */ - auto device_index() { - return std::visit([](auto&& concrete_forest) { - return concrete_forest.device_index(); - }, decision_forest_); + auto device_index() + { + return std::visit([](auto&& concrete_forest) { return concrete_forest.device_index(); }, + decision_forest_); } /** Whether or not model is loaded at double precision */ - auto is_double_precision() { - return std::visit([](auto&& concrete_forest) { - return std::is_same_v< - typename std::remove_reference_t::io_type, double - >; - }, decision_forest_); + auto is_double_precision() + { + return std::visit( + [](auto&& concrete_forest) { + return std::is_same_v::io_type, + double>; + }, + decision_forest_); } /** @@ -128,20 +131,24 @@ struct forest_model { * reasonable value. On CPU, this argument can generally just be omitted. */ template - void predict( - raft_proto::buffer& output, - raft_proto::buffer const& input, - raft_proto::cuda_stream stream = raft_proto::cuda_stream{}, - infer_kind predict_type=infer_kind::default_kind, - std::optional specified_chunk_size=std::nullopt - ) { - std::visit([this, predict_type, &output, &input, &stream, &specified_chunk_size](auto&& concrete_forest) { - if constexpr(std::is_same_v::io_type, io_t>) { - concrete_forest.predict(output, input, stream, predict_type, specified_chunk_size); - } else { - throw type_error("Input type does not match model_type"); - } - }, decision_forest_); + void predict(raft_proto::buffer& output, + raft_proto::buffer const& input, + raft_proto::cuda_stream stream = raft_proto::cuda_stream{}, + infer_kind predict_type = infer_kind::default_kind, + std::optional specified_chunk_size = std::nullopt) + { + std::visit( + [this, predict_type, &output, &input, &stream, &specified_chunk_size]( + auto&& concrete_forest) { + if constexpr (std::is_same_v< + typename std::remove_reference_t::io_type, + io_t>) { + concrete_forest.predict(output, input, stream, predict_type, specified_chunk_size); + } else { + throw type_error("Input type does not match model_type"); + } + }, + decision_forest_); } /** @@ -170,94 +177,76 @@ struct forest_model { * reasonable value. On CPU, this argument can generally just be omitted. 
*/ template - void predict( - raft_proto::handle_t const& handle, - raft_proto::buffer& output, - raft_proto::buffer const& input, - infer_kind predict_type=infer_kind::default_kind, - std::optional specified_chunk_size=std::nullopt - ) { - std::visit([this, predict_type, &handle, &output, &input, &specified_chunk_size](auto&& concrete_forest) { - using model_io_t = typename std::remove_reference_t::io_type; - if constexpr(std::is_same_v) { - if (output.memory_type() == memory_type() && input.memory_type() == memory_type()) { - concrete_forest.predict( - output, - input, - handle.get_next_usable_stream(), - predict_type, - specified_chunk_size - ); - } else { - auto constexpr static const MIN_CHUNKS_PER_PARTITION = std::size_t{64}; - auto constexpr static const MAX_CHUNK_SIZE = std::size_t{64}; - - auto row_count = input.size() / num_features(); - auto partition_size = std::max( - raft_proto::ceildiv(row_count, handle.get_usable_stream_count()), - specified_chunk_size.value_or(MAX_CHUNK_SIZE) * MIN_CHUNKS_PER_PARTITION - ); - auto partition_count = raft_proto::ceildiv(row_count, partition_size); - for (auto i = std::size_t{}; i < partition_count; ++i) { - auto stream = handle.get_next_usable_stream(); - auto rows_in_this_partition = std::min(partition_size, row_count - i * partition_size); - auto partition_in = raft_proto::buffer{}; - if (input.memory_type() != memory_type()) { - partition_in = raft_proto::buffer{ - rows_in_this_partition * num_features(), - memory_type() - }; - raft_proto::copy( - partition_in, - input, - 0, - i * partition_size * num_features(), - partition_in.size(), - stream - ); - } else { - partition_in = raft_proto::buffer{ - input.data() + i * partition_size * num_features(), - rows_in_this_partition * num_features(), - memory_type() - }; - } - auto partition_out = raft_proto::buffer{}; - if (output.memory_type() != memory_type()) { - partition_out = raft_proto::buffer{ - rows_in_this_partition * num_outputs(), - memory_type() - }; - } else { - partition_out = raft_proto::buffer{ - output.data() + i * partition_size * num_outputs(), - rows_in_this_partition * num_outputs(), - memory_type() - }; - } + void predict(raft_proto::handle_t const& handle, + raft_proto::buffer& output, + raft_proto::buffer const& input, + infer_kind predict_type = infer_kind::default_kind, + std::optional specified_chunk_size = std::nullopt) + { + std::visit( + [this, predict_type, &handle, &output, &input, &specified_chunk_size]( + auto&& concrete_forest) { + using model_io_t = typename std::remove_reference_t::io_type; + if constexpr (std::is_same_v) { + if (output.memory_type() == memory_type() && input.memory_type() == memory_type()) { concrete_forest.predict( - partition_out, - partition_in, - stream, - predict_type, - specified_chunk_size - ); - if (output.memory_type() != memory_type()) { - raft_proto::copy( - output, - partition_out, - i * partition_size * num_outputs(), - 0, - partition_out.size(), - stream - ); + output, input, handle.get_next_usable_stream(), predict_type, specified_chunk_size); + } else { + auto constexpr static const MIN_CHUNKS_PER_PARTITION = std::size_t{64}; + auto constexpr static const MAX_CHUNK_SIZE = std::size_t{64}; + + auto row_count = input.size() / num_features(); + auto partition_size = + std::max(raft_proto::ceildiv(row_count, handle.get_usable_stream_count()), + specified_chunk_size.value_or(MAX_CHUNK_SIZE) * MIN_CHUNKS_PER_PARTITION); + auto partition_count = raft_proto::ceildiv(row_count, partition_size); + for (auto i = std::size_t{}; i < 
partition_count; ++i) { + auto stream = handle.get_next_usable_stream(); + auto rows_in_this_partition = + std::min(partition_size, row_count - i * partition_size); + auto partition_in = raft_proto::buffer{}; + if (input.memory_type() != memory_type()) { + partition_in = + raft_proto::buffer{rows_in_this_partition * num_features(), memory_type()}; + raft_proto::copy(partition_in, + input, + 0, + i * partition_size * num_features(), + partition_in.size(), + stream); + } else { + partition_in = + raft_proto::buffer{input.data() + i * partition_size * num_features(), + rows_in_this_partition * num_features(), + memory_type()}; + } + auto partition_out = raft_proto::buffer{}; + if (output.memory_type() != memory_type()) { + partition_out = + raft_proto::buffer{rows_in_this_partition * num_outputs(), memory_type()}; + } else { + partition_out = + raft_proto::buffer{output.data() + i * partition_size * num_outputs(), + rows_in_this_partition * num_outputs(), + memory_type()}; + } + concrete_forest.predict( + partition_out, partition_in, stream, predict_type, specified_chunk_size); + if (output.memory_type() != memory_type()) { + raft_proto::copy(output, + partition_out, + i * partition_size * num_outputs(), + 0, + partition_out.size(), + stream); + } } } + } else { + throw type_error("Input type does not match model_type"); } - } else { - throw type_error("Input type does not match model_type"); - } - }, decision_forest_); + }, + decision_forest_); } /** @@ -285,27 +274,18 @@ struct forest_model { * reasonable value. On CPU, this argument can generally just be omitted. */ template - void predict( - raft_proto::handle_t const& handle, - io_t* output, - io_t* input, - std::size_t num_rows, - raft_proto::device_type out_mem_type, - raft_proto::device_type in_mem_type, - infer_kind predict_type=infer_kind::default_kind, - std::optional specified_chunk_size=std::nullopt - ) { + void predict(raft_proto::handle_t const& handle, + io_t* output, + io_t* input, + std::size_t num_rows, + raft_proto::device_type out_mem_type, + raft_proto::device_type in_mem_type, + infer_kind predict_type = infer_kind::default_kind, + std::optional specified_chunk_size = std::nullopt) + { // TODO(wphicks): Make sure buffer lands on same device as model - auto out_buffer = raft_proto::buffer{ - output, - num_rows * num_outputs(), - out_mem_type - }; - auto in_buffer = raft_proto::buffer{ - input, - num_rows * num_features(), - in_mem_type - }; + auto out_buffer = raft_proto::buffer{output, num_rows * num_outputs(), out_mem_type}; + auto in_buffer = raft_proto::buffer{input, num_rows * num_features(), in_mem_type}; predict(handle, out_buffer, in_buffer, predict_type, specified_chunk_size); } @@ -313,6 +293,6 @@ struct forest_model { decision_forest_variant decision_forest_; }; -} -} -} +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/infer_kind.hpp b/cpp/include/cuml/experimental/fil/infer_kind.hpp index a2b1a57827..59599f6741 100644 --- a/cpp/include/cuml/experimental/fil/infer_kind.hpp +++ b/cpp/include/cuml/experimental/fil/infer_kind.hpp @@ -17,11 +17,7 @@ namespace ML { namespace experimental { namespace fil { -enum class infer_kind : unsigned char { - default_kind = 0, - per_tree = 1, - leaf_id = 2 -}; -} -} +enum class infer_kind : unsigned char { default_kind = 0, per_tree = 1, leaf_id = 2 }; } +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/postproc_ops.hpp 
b/cpp/include/cuml/experimental/fil/postproc_ops.hpp index cdd51d83b1..c7b70c549c 100644 --- a/cpp/include/cuml/experimental/fil/postproc_ops.hpp +++ b/cpp/include/cuml/experimental/fil/postproc_ops.hpp @@ -20,21 +20,21 @@ namespace fil { /** Enum representing possible row-wise operations on output */ enum struct row_op : unsigned char { - disable=0b00100000, - softmax=0b01000000, - max_index=0b10000000 + disable = 0b00100000, + softmax = 0b01000000, + max_index = 0b10000000 }; /** Enum representing possible element-wise operations on output */ enum struct element_op : unsigned char { - disable=0b00000000, - signed_square=0b00000001, - hinge=0b00000010, - sigmoid=0b00000100, - exponential=0b00001000, - logarithm_one_plus_exp=0b00010000 + disable = 0b00000000, + signed_square = 0b00000001, + hinge = 0b00000010, + sigmoid = 0b00000100, + exponential = 0b00001000, + logarithm_one_plus_exp = 0b00010000 }; -} -} -} +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/tree_layout.hpp b/cpp/include/cuml/experimental/fil/tree_layout.hpp index 7edf1acab3..52888efcc8 100644 --- a/cpp/include/cuml/experimental/fil/tree_layout.hpp +++ b/cpp/include/cuml/experimental/fil/tree_layout.hpp @@ -17,10 +17,7 @@ namespace ML { namespace experimental { namespace fil { -enum class tree_layout : unsigned char { - depth_first=0, - breadth_first=1 -}; -} -} +enum class tree_layout : unsigned char { depth_first = 0, breadth_first = 1 }; } +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/experimental/fil/treelite_importer.hpp b/cpp/include/cuml/experimental/fil/treelite_importer.hpp index fb4a0e74a9..23284a36d6 100644 --- a/cpp/include/cuml/experimental/fil/treelite_importer.hpp +++ b/cpp/include/cuml/experimental/fil/treelite_importer.hpp @@ -16,11 +16,6 @@ #pragma once #include #include -#include -#include -#include -#include -#include #include #include #include @@ -29,6 +24,11 @@ #include #include #include +#include +#include +#include +#include +#include namespace ML { namespace experimental { @@ -39,15 +39,11 @@ namespace detail { */ template struct traversal_container { - using backing_container_t = std::conditional_t< - layout == tree_layout::depth_first, - std::stack, - std::queue - >; - void add(T const& val) { - data_.push(val); - } - void add(T const& hot, T const& distant) { + using backing_container_t = + std::conditional_t, std::queue>; + void add(T const& val) { data_.push(val); } + void add(T const& hot, T const& distant) + { if constexpr (layout == tree_layout::depth_first) { data_.push(distant); data_.push(hot); @@ -56,7 +52,8 @@ struct traversal_container { data_.push(distant); } } - auto next() { + auto next() + { if constexpr (std::is_same_v>) { auto result = data_.top(); data_.pop(); @@ -67,49 +64,46 @@ struct traversal_container { return result; } } - auto peek() { + auto peek() + { if constexpr (std::is_same_v>) { return data_.top(); } else { return data_.front(); } } - [[nodiscard]] auto empty() { - return data_.empty(); - } - auto size() { - return data_.size(); - } + [[nodiscard]] auto empty() { return data_.empty(); } + auto size() { return data_.size(); } + private: backing_container_t data_; }; - struct postproc_params_t { - element_op element = element_op::disable; - row_op row = row_op::disable; - double constant = 1.0; - }; -} // namespace detail +struct postproc_params_t { + element_op element = element_op::disable; + row_op row = row_op::disable; + double constant = 1.0; +}; +} // namespace 
detail /** * Struct used to import a model from Treelite to FIL * * @tparam layout The in-memory layout for nodes to be loaded into FIL */ -template +template struct treelite_importer { - template + template struct treelite_node { treelite::Tree const& tree; int node_id; index_type parent_index; index_type own_index; - auto is_leaf() { - return tree.IsLeaf(node_id); - } + auto is_leaf() { return tree.IsLeaf(node_id); } - auto get_output() { + auto get_output() + { auto result = std::vector{}; if (tree.HasLeafVector(node_id)) { result = tree.LeafVector(node_id); @@ -119,20 +113,18 @@ struct treelite_importer { return result; } - auto get_categories() { - return tree.MatchingCategories(node_id); - } + auto get_categories() { return tree.MatchingCategories(node_id); } - auto get_feature() { - return tree.SplitIndex(node_id); - } + auto get_feature() { return tree.SplitIndex(node_id); } - auto is_categorical() { + auto is_categorical() + { return tree.SplitType(node_id) == treelite::SplitFeatureType::kCategorical; } - auto default_distant() { - auto result = false; + auto default_distant() + { + auto result = false; auto default_child = tree.DefaultChild(node_id); if (is_categorical()) { if (tree.CategoriesListRightChild(node_id)) { @@ -151,51 +143,50 @@ struct treelite_importer { return result; } - auto threshold() { - return tree.Threshold(node_id); - } + auto threshold() { return tree.Threshold(node_id); } - auto categories() { + auto categories() + { auto result = decltype(tree.MatchingCategories(node_id)){}; - if (is_categorical()) { - result = tree.MatchingCategories(node_id); - } + if (is_categorical()) { result = tree.MatchingCategories(node_id); } return result; } - auto is_inclusive() { + auto is_inclusive() + { auto tl_operator = tree.ComparisonOp(node_id); return tl_operator == treelite::Operator::kGT || tl_operator == treelite::Operator::kLE; } }; - template - void node_for_each(treelite::Tree const& tl_tree, lambda_t&& lambda) { + template + void node_for_each(treelite::Tree const& tl_tree, lambda_t&& lambda) + { using node_index_t = decltype(tl_tree.LeftChild(0)); auto to_be_visited = detail::traversal_container{}; to_be_visited.add(node_index_t{}); auto parent_indices = detail::traversal_container{}; - auto cur_index = index_type{}; + auto cur_index = index_type{}; parent_indices.add(cur_index); while (!to_be_visited.empty()) { - auto node_id = to_be_visited.next(); + auto node_id = to_be_visited.next(); auto remaining_size = to_be_visited.size(); auto tl_node = treelite_node{ - tl_tree, node_id, parent_indices.next(), cur_index - }; + tl_tree, node_id, parent_indices.next(), cur_index}; lambda(tl_node); if (!tl_tree.IsLeaf(node_id)) { - auto tl_left_id = tl_tree.LeftChild(node_id); + auto tl_left_id = tl_tree.LeftChild(node_id); auto tl_right_id = tl_tree.RightChild(node_id); auto tl_operator = tl_tree.ComparisonOp(node_id); if (!tl_node.is_categorical()) { if (tl_operator == treelite::Operator::kLT || tl_operator == treelite::Operator::kLE) { to_be_visited.add(tl_right_id, tl_left_id); - } else if (tl_operator == treelite::Operator::kGT || tl_operator == treelite::Operator::kGE) { + } else if (tl_operator == treelite::Operator::kGT || + tl_operator == treelite::Operator::kGE) { to_be_visited.add(tl_left_id, tl_right_id); } else { throw model_import_error("Unrecognized Treelite operator"); @@ -213,41 +204,42 @@ struct treelite_importer { } } - template - void node_transform(treelite::Tree const& tl_tree, iter_t output_iter, lambda_t&& lambda) { - node_for_each( - tl_tree, - 
[&output_iter, &lambda](auto&& tl_node) { - *output_iter = lambda(tl_node); - ++output_iter; - } - ); + template + void node_transform(treelite::Tree const& tl_tree, + iter_t output_iter, + lambda_t&& lambda) + { + node_for_each(tl_tree, [&output_iter, &lambda](auto&& tl_node) { + *output_iter = lambda(tl_node); + ++output_iter; + }); } - template - auto node_accumulate(treelite::Tree const& tl_tree, T init, lambda_t&& lambda) { + template + auto node_accumulate(treelite::Tree const& tl_tree, + T init, + lambda_t&& lambda) + { auto result = init; - node_for_each( - tl_tree, - [&result, &lambda](auto&& tl_node) { - result = lambda(result, tl_node); - } - ); + node_for_each(tl_tree, + [&result, &lambda](auto&& tl_node) { result = lambda(result, tl_node); }); return result; } - template - auto get_nodes(treelite::Tree const& tl_tree) { + template + auto get_nodes(treelite::Tree const& tl_tree) + { auto result = std::vector>{}; result.reserve(tl_tree.num_nodes); node_transform(tl_tree, std::back_inserter(result), [](auto&& node) { return node; }); return result; } - template - auto get_offsets(treelite::Tree const& tl_tree) { + template + auto get_offsets(treelite::Tree const& tl_tree) + { auto result = std::vector(tl_tree.num_nodes); - auto nodes = get_nodes(tl_tree); + auto nodes = get_nodes(tl_tree); for (auto i = index_type{}; i < nodes.size(); ++i) { // Current index should always be greater than or equal to parent index. // Later children will overwrite values set by earlier children, ensuring @@ -258,85 +250,76 @@ struct treelite_importer { return result; } - template - void tree_for_each(treelite::Model const& tl_model, lambda_t&& lambda) { + template + void tree_for_each(treelite::Model const& tl_model, lambda_t&& lambda) + { tl_model.Dispatch([&lambda](auto&& concrete_tl_model) { - std::for_each( - std::begin(concrete_tl_model.trees), - std::end(concrete_tl_model.trees), - lambda - ); + std::for_each(std::begin(concrete_tl_model.trees), std::end(concrete_tl_model.trees), lambda); }); } - template - void tree_transform(treelite::Model const& tl_model, iter_t output_iter, lambda_t&& lambda) { + template + void tree_transform(treelite::Model const& tl_model, iter_t output_iter, lambda_t&& lambda) + { tl_model.Dispatch([&output_iter, &lambda](auto&& concrete_tl_model) { - std::transform( - std::begin(concrete_tl_model.trees), - std::end(concrete_tl_model.trees), - output_iter, - lambda - ); + std::transform(std::begin(concrete_tl_model.trees), + std::end(concrete_tl_model.trees), + output_iter, + lambda); }); } - template - auto tree_accumulate(treelite::Model const& tl_model, T init, lambda_t&& lambda) { + template + auto tree_accumulate(treelite::Model const& tl_model, T init, lambda_t&& lambda) + { auto result = init; - tree_for_each( - tl_model, - [&result, &lambda](auto&& tree) { - result = lambda(result, tree); - } - ); + tree_for_each(tl_model, [&result, &lambda](auto&& tree) { result = lambda(result, tree); }); return result; } - auto num_trees(treelite::Model const& tl_model) { + auto num_trees(treelite::Model const& tl_model) + { auto result = index_type{}; - tl_model.Dispatch([&result](auto&& concrete_tl_model) { - result = concrete_tl_model.trees.size(); - }); + tl_model.Dispatch( + [&result](auto&& concrete_tl_model) { result = concrete_tl_model.trees.size(); }); return result; } - auto get_offsets(treelite::Model const& tl_model) { + auto get_offsets(treelite::Model const& tl_model) + { auto result = std::vector>{}; result.reserve(num_trees(tl_model)); - 
tree_transform(tl_model, std::back_inserter(result), [this](auto&&tree) { - return get_offsets(tree); - }); + tree_transform( + tl_model, std::back_inserter(result), [this](auto&& tree) { return get_offsets(tree); }); return result; } - auto get_tree_sizes(treelite::Model const& tl_model) { + auto get_tree_sizes(treelite::Model const& tl_model) + { auto result = std::vector{}; tree_transform( - tl_model, - std::back_inserter(result), - [](auto&& tree) { return tree.num_nodes; } - ); + tl_model, std::back_inserter(result), [](auto&& tree) { return tree.num_nodes; }); return result; } - auto get_num_class(treelite::Model const& tl_model) { + auto get_num_class(treelite::Model const& tl_model) + { auto result = index_type{}; - tl_model.Dispatch([&result](auto&& concrete_tl_model) { - result = concrete_tl_model.task_param.num_class; - }); + tl_model.Dispatch( + [&result](auto&& concrete_tl_model) { result = concrete_tl_model.task_param.num_class; }); return result; } - auto get_num_feature(treelite::Model const& tl_model) { + auto get_num_feature(treelite::Model const& tl_model) + { auto result = index_type{}; - tl_model.Dispatch([&result](auto&& concrete_tl_model) { - result = concrete_tl_model.num_feature; - }); + tl_model.Dispatch( + [&result](auto&& concrete_tl_model) { result = concrete_tl_model.num_feature; }); return result; } - auto get_max_num_categories(treelite::Model const& tl_model) { + auto get_max_num_categories(treelite::Model const& tl_model) + { return tree_accumulate(tl_model, index_type{}, [this](auto&& accum, auto&& tree) { return node_accumulate(tree, accum, [](auto&& cur_accum, auto&& tl_node) { auto result = cur_accum; @@ -348,7 +331,8 @@ struct treelite_importer { }); } - auto get_num_categorical_nodes(treelite::Model const& tl_model) { + auto get_num_categorical_nodes(treelite::Model const& tl_model) + { return tree_accumulate(tl_model, index_type{}, [this](auto&& accum, auto&& tree) { return node_accumulate(tree, accum, [](auto&& cur_accum, auto&& tl_node) { return cur_accum + tl_node.is_categorical(); @@ -356,7 +340,8 @@ struct treelite_importer { }); } - auto get_num_leaf_vector_nodes(treelite::Model const& tl_model) { + auto get_num_leaf_vector_nodes(treelite::Model const& tl_model) + { return tree_accumulate(tl_model, index_type{}, [this](auto&& accum, auto&& tree) { return node_accumulate(tree, accum, [](auto&& cur_accum, auto&& tl_node) { return cur_accum + (tl_node.is_leaf() && tl_node.get_output().size() > 1); @@ -364,7 +349,8 @@ struct treelite_importer { }); } - auto get_average_factor(treelite::Model const& tl_model) { + auto get_average_factor(treelite::Model const& tl_model) + { auto result = double{}; tl_model.Dispatch([&result](auto&& concrete_tl_model) { if (concrete_tl_model.average_tree_output) { @@ -380,35 +366,35 @@ struct treelite_importer { return result; } - auto get_bias(treelite::Model const& tl_model) { + auto get_bias(treelite::Model const& tl_model) + { auto result = double{}; - tl_model.Dispatch([&result](auto&& concrete_tl_model) { - result = concrete_tl_model.param.global_bias; - }); + tl_model.Dispatch( + [&result](auto&& concrete_tl_model) { result = concrete_tl_model.param.global_bias; }); return result; } - auto get_postproc_params(treelite::Model const& tl_model) { + auto get_postproc_params(treelite::Model const& tl_model) + { auto result = detail::postproc_params_t{}; tl_model.Dispatch([&result](auto&& concrete_tl_model) { auto tl_pred_transform = std::string{concrete_tl_model.param.pred_transform}; - if ( - tl_pred_transform == 
std::string{"identity"} || + if (tl_pred_transform == std::string{"identity"} || tl_pred_transform == std::string{"identity_multiclass"}) { result.element = element_op::disable; - result.row = row_op::disable; + result.row = row_op::disable; } else if (tl_pred_transform == std::string{"signed_square"}) { result.element = element_op::signed_square; } else if (tl_pred_transform == std::string{"hinge"}) { result.element = element_op::hinge; } else if (tl_pred_transform == std::string{"sigmoid"}) { result.constant = concrete_tl_model.param.sigmoid_alpha; - result.element = element_op::sigmoid; + result.element = element_op::sigmoid; } else if (tl_pred_transform == std::string{"exponential"}) { result.element = element_op::exponential; } else if (tl_pred_transform == std::string{"exponential_standard_ratio"}) { result.constant = -concrete_tl_model.param.ratio_c / std::log(2); - result.element = element_op::exponential; + result.element = element_op::exponential; } else if (tl_pred_transform == std::string{"logarithm_one_plus_exp"}) { result.element = element_op::logarithm_one_plus_exp; } else if (tl_pred_transform == std::string{"max_index"}) { @@ -417,7 +403,7 @@ struct treelite_importer { result.row = row_op::softmax; } else if (tl_pred_transform == std::string{"multiclass_ova"}) { result.constant = concrete_tl_model.param.sigmoid_alpha; - result.element = element_op::sigmoid; + result.element = element_op::sigmoid; } else { throw model_import_error{"Unrecognized Treelite pred_transform string"}; } @@ -425,53 +411,37 @@ struct treelite_importer { return result; } - auto uses_double_thresholds(treelite::Model const& tl_model) { + auto uses_double_thresholds(treelite::Model const& tl_model) + { auto result = false; switch (tl_model.GetThresholdType()) { - case treelite::TypeInfo::kFloat64: - result = true; - break; - case treelite::TypeInfo::kFloat32: - result = false; - break; - default: - throw model_import_error("Unrecognized Treelite threshold type"); + case treelite::TypeInfo::kFloat64: result = true; break; + case treelite::TypeInfo::kFloat32: result = false; break; + default: throw model_import_error("Unrecognized Treelite threshold type"); } return result; } - auto uses_double_outputs(treelite::Model const& tl_model) { + auto uses_double_outputs(treelite::Model const& tl_model) + { auto result = false; switch (tl_model.GetThresholdType()) { - case treelite::TypeInfo::kFloat64: - result = true; - break; - case treelite::TypeInfo::kFloat32: - result = false; - break; - case treelite::TypeInfo::kUInt32: - result = false; - break; - default: - throw model_import_error("Unrecognized Treelite threshold type"); + case treelite::TypeInfo::kFloat64: result = true; break; + case treelite::TypeInfo::kFloat32: result = false; break; + case treelite::TypeInfo::kUInt32: result = false; break; + default: throw model_import_error("Unrecognized Treelite threshold type"); } return result; } - auto uses_integer_outputs(treelite::Model const& tl_model) { + auto uses_integer_outputs(treelite::Model const& tl_model) + { auto result = false; switch (tl_model.GetThresholdType()) { - case treelite::TypeInfo::kFloat64: - result = false; - break; - case treelite::TypeInfo::kFloat32: - result = false; - break; - case treelite::TypeInfo::kUInt32: - result = true; - break; - default: - throw model_import_error("Unrecognized Treelite threshold type"); + case treelite::TypeInfo::kFloat64: result = false; break; + case treelite::TypeInfo::kFloat32: result = false; break; + case treelite::TypeInfo::kUInt32: result = 
true; break; + default: throw model_import_error("Unrecognized Treelite threshold type"); } return result; } @@ -480,24 +450,24 @@ struct treelite_importer { * Assuming that the correct decision_forest variant has been * identified, import to that variant */ - template - auto import_to_specific_variant( - index_type target_variant_index, - treelite::Model const& tl_model, - index_type num_class, - index_type num_feature, - index_type max_num_categories, - std::vector> const& offsets, - index_type align_bytes = index_type{}, - raft_proto::device_type mem_type=raft_proto::device_type::cpu, - int device=0, - raft_proto::cuda_stream stream=raft_proto::cuda_stream{} - ) { + template + auto import_to_specific_variant(index_type target_variant_index, + treelite::Model const& tl_model, + index_type num_class, + index_type num_feature, + index_type max_num_categories, + std::vector> const& offsets, + index_type align_bytes = index_type{}, + raft_proto::device_type mem_type = raft_proto::device_type::cpu, + int device = 0, + raft_proto::cuda_stream stream = raft_proto::cuda_stream{}) + { auto result = decision_forest_variant{}; if constexpr (variant_index != std::variant_size_v) { if (variant_index == target_variant_index) { using forest_model_t = std::variant_alternative_t; - auto builder = detail::decision_forest_builder(max_num_categories, align_bytes); + auto builder = + detail::decision_forest_builder(max_num_categories, align_bytes); auto tree_count = num_trees(tl_model); auto tree_index = index_type{}; tree_for_each(tl_model, [this, &builder, &tree_index, &offsets](auto&& tree) { @@ -508,36 +478,26 @@ struct treelite_importer { auto output = node.get_output(); builder.set_output_size(output.size()); if (output.size() > index_type{1}) { - builder.add_leaf_vector_node( - std::begin(output), - std::end(output) - ); + builder.add_leaf_vector_node(std::begin(output), std::end(output)); } else { - builder.add_node( - typename forest_model_t::io_type(output[0]), - true - ); + builder.add_node(typename forest_model_t::io_type(output[0]), true); } } else { if (node.is_categorical()) { auto categories = node.get_categories(); - builder.add_categorical_node( - std::begin(categories), - std::end(categories), - node.default_distant(), - node.get_feature(), - offsets[tree_index][node_index] - ); + builder.add_categorical_node(std::begin(categories), + std::end(categories), + node.default_distant(), + node.get_feature(), + offsets[tree_index][node_index]); } else { - builder.add_node( - typename forest_model_t::threshold_type(node.threshold()), - false, - node.default_distant(), - false, - node.get_feature(), - offsets[tree_index][node_index], - node.is_inclusive() - ); + builder.add_node(typename forest_model_t::threshold_type(node.threshold()), + false, + node.default_distant(), + false, + node.get_feature(), + offsets[tree_index][node_index], + node.is_inclusive()); } } ++node_index; @@ -552,20 +512,19 @@ struct treelite_importer { builder.set_row_postproc(postproc_params.row); builder.set_postproc_constant(postproc_params.constant); - result.template emplace(builder.get_decision_forest(num_feature, num_class, mem_type, device, stream)); + result.template emplace( + builder.get_decision_forest(num_feature, num_class, mem_type, device, stream)); } else { - result = import_to_specific_variant( - target_variant_index, - tl_model, - num_class, - num_feature, - max_num_categories, - offsets, - align_bytes, - mem_type, - device, - stream - ); + result = import_to_specific_variant(target_variant_index, + 
tl_model, + num_class, + num_feature, + max_num_categories, + offsets, + align_bytes, + mem_type, + device, + stream); } } return result; @@ -593,62 +552,53 @@ struct treelite_importer { * @param stream The CUDA stream to use for loading this model (can be * omitted for CPU). */ - auto import( - treelite::Model const& tl_model, - index_type align_bytes = index_type{}, - std::optional use_double_precision = std::nullopt, - raft_proto::device_type dev_type=raft_proto::device_type::cpu, - int device=0, - raft_proto::cuda_stream stream=raft_proto::cuda_stream{} - ) { - auto result = decision_forest_variant{}; - auto num_feature = get_num_feature(tl_model); - auto max_num_categories = get_max_num_categories(tl_model); + auto import(treelite::Model const& tl_model, + index_type align_bytes = index_type{}, + std::optional use_double_precision = std::nullopt, + raft_proto::device_type dev_type = raft_proto::device_type::cpu, + int device = 0, + raft_proto::cuda_stream stream = raft_proto::cuda_stream{}) + { + auto result = decision_forest_variant{}; + auto num_feature = get_num_feature(tl_model); + auto max_num_categories = get_max_num_categories(tl_model); auto num_categorical_nodes = get_num_categorical_nodes(tl_model); auto num_leaf_vector_nodes = get_num_leaf_vector_nodes(tl_model); auto use_double_thresholds = use_double_precision.value_or(uses_double_thresholds(tl_model)); - auto offsets = get_offsets(tl_model); + auto offsets = get_offsets(tl_model); auto max_offset = std::accumulate( std::begin(offsets), std::end(offsets), index_type{}, [&offsets](auto&& cur_max, auto&& tree_offsets) { - return std::max(cur_max, *std::max_element(std::begin(tree_offsets), std::end(tree_offsets))); - } - ); + return std::max(cur_max, + *std::max_element(std::begin(tree_offsets), std::end(tree_offsets))); + }); auto tree_sizes = std::vector{}; - std::transform( - std::begin(offsets), - std::end(offsets), - std::back_inserter(tree_sizes), - [](auto&& tree_offsets) { - return tree_offsets.size(); - } - ); - - auto variant_index = get_forest_variant_index( - use_double_thresholds, - max_offset, - num_feature, - num_categorical_nodes, - max_num_categories, - num_leaf_vector_nodes, - layout - ); - auto num_class = get_num_class(tl_model); - return forest_model{import_to_specific_variant( - variant_index, - tl_model, - num_class, - num_feature, - max_num_categories, - offsets, - align_bytes, - dev_type, - device, - stream - )}; + std::transform(std::begin(offsets), + std::end(offsets), + std::back_inserter(tree_sizes), + [](auto&& tree_offsets) { return tree_offsets.size(); }); + + auto variant_index = get_forest_variant_index(use_double_thresholds, + max_offset, + num_feature, + num_categorical_nodes, + max_num_categories, + num_leaf_vector_nodes, + layout); + auto num_class = get_num_class(tl_model); + return forest_model{import_to_specific_variant(variant_index, + tl_model, + num_class, + num_feature, + max_num_categories, + offsets, + align_bytes, + dev_type, + device, + stream)}; } }; @@ -675,36 +625,23 @@ struct treelite_importer { * @param stream The CUDA stream to use for loading this model (can be * omitted for CPU). 
*/ -auto import_from_treelite_model( - treelite::Model const& tl_model, - tree_layout layout=preferred_tree_layout, - index_type align_bytes = index_type{}, - std::optional use_double_precision = std::nullopt, - raft_proto::device_type dev_type=raft_proto::device_type::cpu, - int device=0, - raft_proto::cuda_stream stream=raft_proto::cuda_stream{} -) { +auto import_from_treelite_model(treelite::Model const& tl_model, + tree_layout layout = preferred_tree_layout, + index_type align_bytes = index_type{}, + std::optional use_double_precision = std::nullopt, + raft_proto::device_type dev_type = raft_proto::device_type::cpu, + int device = 0, + raft_proto::cuda_stream stream = raft_proto::cuda_stream{}) +{ auto result = forest_model{}; - switch(layout) { + switch (layout) { case tree_layout::depth_first: result = treelite_importer{}.import( - tl_model, - align_bytes, - use_double_precision, - dev_type, - device, - stream - ); + tl_model, align_bytes, use_double_precision, dev_type, device, stream); break; case tree_layout::breadth_first: result = treelite_importer{}.import( - tl_model, - align_bytes, - use_double_precision, - dev_type, - device, - stream - ); + tl_model, align_bytes, use_double_precision, dev_type, device, stream); break; } return result; @@ -734,26 +671,23 @@ auto import_from_treelite_model( * @param stream The CUDA stream to use for loading this model (can be * omitted for CPU). */ -auto import_from_treelite_handle( - ModelHandle tl_handle, - tree_layout layout=preferred_tree_layout, - index_type align_bytes = index_type{}, - std::optional use_double_precision = std::nullopt, - raft_proto::device_type dev_type=raft_proto::device_type::cpu, - int device=0, - raft_proto::cuda_stream stream=raft_proto::cuda_stream{} -) { - return import_from_treelite_model( - *static_cast(tl_handle), - layout, - align_bytes, - use_double_precision, - dev_type, - device, - stream - ); +auto import_from_treelite_handle(ModelHandle tl_handle, + tree_layout layout = preferred_tree_layout, + index_type align_bytes = index_type{}, + std::optional use_double_precision = std::nullopt, + raft_proto::device_type dev_type = raft_proto::device_type::cpu, + int device = 0, + raft_proto::cuda_stream stream = raft_proto::cuda_stream{}) +{ + return import_from_treelite_model(*static_cast(tl_handle), + layout, + align_bytes, + use_double_precision, + dev_type, + device, + stream); } -} -} -} +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/include/cuml/genetic/node.h b/cpp/include/cuml/genetic/node.h index b7dc4da0d4..c9b0629005 100644 --- a/cpp/include/cuml/genetic/node.h +++ b/cpp/include/cuml/genetic/node.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -86,7 +86,7 @@ struct node { tanh, unary_end = tanh, // keep this to be the last unary function in the list functions_end = unary_end, - }; // enum type + }; // enum type /** * @brief Default constructor for node diff --git a/cpp/include/cuml/metrics/metrics.hpp b/cpp/include/cuml/metrics/metrics.hpp index beef06c89a..8d4fceb28a 100644 --- a/cpp/include/cuml/metrics/metrics.hpp +++ b/cpp/include/cuml/metrics/metrics.hpp @@ -177,7 +177,8 @@ double adjusted_rand_index(const raft::handle_t& handle, * * The KL divergence tells us how well the probability distribution Q * approximates the probability distribution P - * It is often also used as a 'distance metric' between two probability distributions (not symmetric) + * It is often also used as a 'distance metric' between two probability distributions (not + * symmetric) * * @param handle: raft::handle_t * @param y: Array of probabilities corresponding to distribution P @@ -192,7 +193,8 @@ double kl_divergence(const raft::handle_t& handle, const double* y, const double * * The KL divergence tells us how well the probability distribution Q * approximates the probability distribution P - * It is often also used as a 'distance metric' between two probability distributions (not symmetric) + * It is often also used as a 'distance metric' between two probability distributions (not + * symmetric) * * @param handle: raft::handle_t * @param y: Array of probabilities corresponding to distribution P diff --git a/cpp/src/arima/batched_kalman.cu b/cpp/src/arima/batched_kalman.cu index debaaaf46e..52b2033860 100644 --- a/cpp/src/arima/batched_kalman.cu +++ b/cpp/src/arima/batched_kalman.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -1624,12 +1624,12 @@ void batched_jones_transform(raft::handle_t& handle, double* d_params = arima_mem.d_params; double* d_Tparams = arima_mem.d_Tparams; ARIMAParams params = {arima_mem.params_mu, - arima_mem.params_beta, - arima_mem.params_ar, - arima_mem.params_ma, - arima_mem.params_sar, - arima_mem.params_sma, - arima_mem.params_sigma2}; + arima_mem.params_beta, + arima_mem.params_ar, + arima_mem.params_ma, + arima_mem.params_sar, + arima_mem.params_sma, + arima_mem.params_sigma2}; ARIMAParams Tparams = {params.mu, params.beta, arima_mem.Tparams_ar, diff --git a/cpp/src/decisiontree/batched-levelalgo/builder.cuh b/cpp/src/decisiontree/batched-levelalgo/builder.cuh index fef69b12f7..453110c099 100644 --- a/cpp/src/decisiontree/batched-levelalgo/builder.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/builder.cuh @@ -285,7 +285,7 @@ struct Builder { d_wsize += calculateAlignedBytes(sizeof(IdxT) * max_batch * dataset.n_sampled_cols); // colids // all nodes in the tree - h_wsize += // h_workload_info + h_wsize += // h_workload_info calculateAlignedBytes(sizeof(WorkloadInfo) * max_blocks_dimx); h_wsize += calculateAlignedBytes(sizeof(SplitT) * max_batch); // splits diff --git a/cpp/src/experimental/fil/infer0.cpp b/cpp/src/experimental/fil/infer0.cpp index 70c6760065..24e975f711 100644 --- a/cpp/src/experimental/fil/infer0.cpp +++ b/cpp/src/experimental/fil/infer0.cpp @@ -22,7 +22,7 @@ namespace detail { namespace inference { CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 0) } -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/src/experimental/fil/infer0.cu b/cpp/src/experimental/fil/infer0.cu index 6dfd466507..8ad2a3fec0 100644 --- a/cpp/src/experimental/fil/infer0.cu +++ b/cpp/src/experimental/fil/infer0.cu @@ -14,8 +14,8 @@ * limitations under the License. */ #include -#include #include +#include #include namespace ML { namespace experimental { @@ -27,7 +27,7 @@ CUML_FIL_INFER_ALL(template, raft_proto::device_type::gpu, 0) namespace device_initialization { CUML_FIL_INITIALIZE_DEVICE(template, 0) } -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/src/experimental/fil/infer1.cpp b/cpp/src/experimental/fil/infer1.cpp index 64276adc94..d2778f1362 100644 --- a/cpp/src/experimental/fil/infer1.cpp +++ b/cpp/src/experimental/fil/infer1.cpp @@ -22,7 +22,7 @@ namespace detail { namespace inference { CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 1) } -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/src/experimental/fil/infer1.cu b/cpp/src/experimental/fil/infer1.cu index e01bb07163..f0e230a2dd 100644 --- a/cpp/src/experimental/fil/infer1.cu +++ b/cpp/src/experimental/fil/infer1.cu @@ -14,8 +14,8 @@ * limitations under the License. 
*/ #include -#include #include +#include #include namespace ML { namespace experimental { @@ -27,7 +27,7 @@ CUML_FIL_INFER_ALL(template, raft_proto::device_type::gpu, 1) namespace device_initialization { CUML_FIL_INITIALIZE_DEVICE(template, 1) } -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/src/experimental/fil/infer2.cpp b/cpp/src/experimental/fil/infer2.cpp index 6bc6e73a10..88bc868dde 100644 --- a/cpp/src/experimental/fil/infer2.cpp +++ b/cpp/src/experimental/fil/infer2.cpp @@ -22,7 +22,7 @@ namespace detail { namespace inference { CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 2) } -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/src/experimental/fil/infer2.cu b/cpp/src/experimental/fil/infer2.cu index eb61097e3b..8af1e252b3 100644 --- a/cpp/src/experimental/fil/infer2.cu +++ b/cpp/src/experimental/fil/infer2.cu @@ -14,8 +14,8 @@ * limitations under the License. */ #include -#include #include +#include #include namespace ML { namespace experimental { @@ -27,7 +27,7 @@ CUML_FIL_INFER_ALL(template, raft_proto::device_type::gpu, 2) namespace device_initialization { CUML_FIL_INITIALIZE_DEVICE(template, 2) } -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/src/experimental/fil/infer3.cpp b/cpp/src/experimental/fil/infer3.cpp index e7938821e1..fc5126f8b9 100644 --- a/cpp/src/experimental/fil/infer3.cpp +++ b/cpp/src/experimental/fil/infer3.cpp @@ -22,7 +22,7 @@ namespace detail { namespace inference { CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 3) } -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/src/experimental/fil/infer3.cu b/cpp/src/experimental/fil/infer3.cu index 4777793d1b..ea7f5dc718 100644 --- a/cpp/src/experimental/fil/infer3.cu +++ b/cpp/src/experimental/fil/infer3.cu @@ -14,8 +14,8 @@ * limitations under the License. */ #include -#include #include +#include #include namespace ML { namespace experimental { @@ -27,7 +27,7 @@ CUML_FIL_INFER_ALL(template, raft_proto::device_type::gpu, 3) namespace device_initialization { CUML_FIL_INITIALIZE_DEVICE(template, 3) } -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/src/experimental/fil/infer4.cpp b/cpp/src/experimental/fil/infer4.cpp index b204cdb3a8..0f5b63583f 100644 --- a/cpp/src/experimental/fil/infer4.cpp +++ b/cpp/src/experimental/fil/infer4.cpp @@ -22,7 +22,7 @@ namespace detail { namespace inference { CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 4) } -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/src/experimental/fil/infer4.cu b/cpp/src/experimental/fil/infer4.cu index 50e8987702..b47cb95b17 100644 --- a/cpp/src/experimental/fil/infer4.cu +++ b/cpp/src/experimental/fil/infer4.cu @@ -14,8 +14,8 @@ * limitations under the License. 
*/ #include -#include #include +#include #include namespace ML { namespace experimental { @@ -27,7 +27,7 @@ CUML_FIL_INFER_ALL(template, raft_proto::device_type::gpu, 4) namespace device_initialization { CUML_FIL_INITIALIZE_DEVICE(template, 4) } -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/src/experimental/fil/infer5.cpp b/cpp/src/experimental/fil/infer5.cpp index 8b50064336..3f98d08cac 100644 --- a/cpp/src/experimental/fil/infer5.cpp +++ b/cpp/src/experimental/fil/infer5.cpp @@ -22,7 +22,7 @@ namespace detail { namespace inference { CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 5) } -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/src/experimental/fil/infer5.cu b/cpp/src/experimental/fil/infer5.cu index d48c626bb1..81a478ac8c 100644 --- a/cpp/src/experimental/fil/infer5.cu +++ b/cpp/src/experimental/fil/infer5.cu @@ -14,8 +14,8 @@ * limitations under the License. */ #include -#include #include +#include #include namespace ML { namespace experimental { @@ -27,7 +27,7 @@ CUML_FIL_INFER_ALL(template, raft_proto::device_type::gpu, 5) namespace device_initialization { CUML_FIL_INITIALIZE_DEVICE(template, 5) } -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/src/experimental/fil/infer6.cpp b/cpp/src/experimental/fil/infer6.cpp index 39df1e7702..26e5fe551e 100644 --- a/cpp/src/experimental/fil/infer6.cpp +++ b/cpp/src/experimental/fil/infer6.cpp @@ -22,7 +22,7 @@ namespace detail { namespace inference { CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 6) } -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/src/experimental/fil/infer6.cu b/cpp/src/experimental/fil/infer6.cu index aa4ea858d5..44a8a9e759 100644 --- a/cpp/src/experimental/fil/infer6.cu +++ b/cpp/src/experimental/fil/infer6.cu @@ -14,8 +14,8 @@ * limitations under the License. */ #include -#include #include +#include #include namespace ML { namespace experimental { @@ -27,7 +27,7 @@ CUML_FIL_INFER_ALL(template, raft_proto::device_type::gpu, 6) namespace device_initialization { CUML_FIL_INITIALIZE_DEVICE(template, 6) } -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/src/experimental/fil/infer7.cpp b/cpp/src/experimental/fil/infer7.cpp index ee56bb5137..81fd36e388 100644 --- a/cpp/src/experimental/fil/infer7.cpp +++ b/cpp/src/experimental/fil/infer7.cpp @@ -22,7 +22,7 @@ namespace detail { namespace inference { CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 7) } -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/src/experimental/fil/infer7.cu b/cpp/src/experimental/fil/infer7.cu index a4276c227f..52662673c0 100644 --- a/cpp/src/experimental/fil/infer7.cu +++ b/cpp/src/experimental/fil/infer7.cu @@ -14,8 +14,8 @@ * limitations under the License. 
*/ #include -#include #include +#include #include namespace ML { namespace experimental { @@ -27,7 +27,7 @@ CUML_FIL_INFER_ALL(template, raft_proto::device_type::gpu, 7) namespace device_initialization { CUML_FIL_INITIALIZE_DEVICE(template, 7) } -} -} -} -} +} // namespace detail +} // namespace fil +} // namespace experimental +} // namespace ML diff --git a/cpp/src/explainer/tree_shap.cu b/cpp/src/explainer/tree_shap.cu index 0dce3675af..926125d3ab 100644 --- a/cpp/src/explainer/tree_shap.cu +++ b/cpp/src/explainer/tree_shap.cu @@ -59,7 +59,7 @@ class BitField { __host__ __device__ explicit BitField(raft::span bits) : bits_(bits) {} __host__ __device__ BitField(const BitField& other) : bits_(other.bits_) {} BitField& operator=(const BitField& other) = default; - BitField& operator=(BitField&& other) = default; + BitField& operator=(BitField&& other) = default; __host__ __device__ bool Check(std::size_t pos) const { T bitmask = kOne << (pos % kValueSize); diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 8721019529..6d7c611d6f 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,10 +17,10 @@ /** @file fil.cu fil.cu implements the forest data types (dense and sparse), including their creation and prediction (the main inference kernel is defined in infer.cu). */ -#include "common.cuh" // for predict_params, storage, storage -#include "internal.cuh" // for cat_sets_device_owner, categorical_sets, output_t, +#include "common.cuh" // for predict_params, storage, storage +#include "internal.cuh" // for cat_sets_device_owner, categorical_sets, output_t, -#include // for algo_t, +#include // for algo_t, #include // for ASSERT #include // for handle_t @@ -28,9 +28,9 @@ creation and prediction (the main inference kernel is defined in infer.cu). 
*/ #include // for device_uvector #include // for host_vector -#include // for expf -#include // for size_t -#include // for uint8_t +#include // for expf +#include // for size_t +#include // for uint8_t namespace ML { namespace fil { diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 4f18d0c072..0d10feac31 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -299,8 +299,7 @@ template struct tree; template -struct leaf_output_t { -}; +struct leaf_output_t {}; template struct leaf_output_t { typedef real_t T; diff --git a/cpp/src/fil/treelite_import.cu b/cpp/src/fil/treelite_import.cu index 533a3be4ba..80d7f2c0f1 100644 --- a/cpp/src/fil/treelite_import.cu +++ b/cpp/src/fil/treelite_import.cu @@ -22,7 +22,7 @@ #include // for CUML_LOG_WARN #include // for algo_t, from_treelite, storage_type_repr, storage_type_t, treelite_params_t -#include // for fowler_noll_vo_fingerprint64_32 +#include // for fowler_noll_vo_fingerprint64_32 #include // for ASSERT #include // for handle_t @@ -32,18 +32,18 @@ #include // for ModelHandle #include // for Tree, Model, ModelImpl, ModelParam -#include // for omp +#include // for omp -#include // for std::max -#include // for std::bitset -#include // for NAN -#include // for std::size_t -#include // for uint8_t -#include // for ios, stringstream -#include // for std::numeric_limits -#include // for std::stack -#include // for std::string -#include // for std::is_same +#include // for std::max +#include // for std::bitset +#include // for NAN +#include // for std::size_t +#include // for uint8_t +#include // for ios, stringstream +#include // for std::numeric_limits +#include // for std::stack +#include // for std::string +#include // for std::is_same namespace ML { namespace fil { @@ -204,9 +204,9 @@ inline std::size_t bit_pool_size(const tl::Tree& tree, const categorical_s template cat_sets_owner allocate_cat_sets_owner(const tl::ModelImpl& model) { -#pragma omp declare reduction(cat_counter_vec_red : std::vector \ - : elementwise_combine(omp_out, omp_in)) \ - initializer(omp_priv = omp_orig) +#pragma omp declare reduction( \ + cat_counter_vec_red : std::vector : elementwise_combine( \ + omp_out, omp_in)) initializer(omp_priv = omp_orig) const auto& trees = model.trees; cat_sets_owner cat_sets; std::vector counters(model.num_feature); diff --git a/cpp/src/genetic/fitness.cuh b/cpp/src/genetic/fitness.cuh index 82e00c2e24..82d13203f3 100644 --- a/cpp/src/genetic/fitness.cuh +++ b/cpp/src/genetic/fitness.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -63,18 +63,18 @@ void weightedPearson(const raft::handle_t& h, rmm::device_uvector y_tmp(n_samples, stream); rmm::device_uvector x_tmp(n_samples * n_progs, stream); - rmm::device_scalar y_mu(stream); // output mean - rmm::device_uvector x_mu(n_progs, stream); // predicted output mean + rmm::device_scalar y_mu(stream); // output mean + rmm::device_uvector x_mu(n_progs, stream); // predicted output mean rmm::device_uvector y_diff(n_samples, stream); // normalized output rmm::device_uvector x_diff(n_samples * n_progs, - stream); // normalized predicted output + stream); // normalized predicted output - rmm::device_uvector y_std(1, stream); // output stddev + rmm::device_uvector y_std(1, stream); // output stddev rmm::device_uvector x_std(n_progs, - stream); // predicted output stddev + stream); // predicted output stddev - rmm::device_scalar dWS(stream); // sample weight sum + rmm::device_scalar dWS(stream); // sample weight sum math_t N = (math_t)n_samples; // Sum of weights diff --git a/cpp/src/genetic/genetic.cu b/cpp/src/genetic/genetic.cu index 3cc6680ce5..661332fb91 100644 --- a/cpp/src/genetic/genetic.cu +++ b/cpp/src/genetic/genetic.cu @@ -370,8 +370,8 @@ void symFit(const raft::handle_t& handle, { cudaStream_t stream = handle.get_stream(); - // Update arity map in params - Need to do this only here, as all operations will call Fit at least - // once + // Update arity map in params - Need to do this only here, as all operations will call Fit at + // least once for (auto f : params.function_set) { int ar = 1; if (node::type::binary_begin <= f && f <= node::type::binary_end) { ar = 2; } diff --git a/cpp/src/glm/qn/glm_base.cuh b/cpp/src/glm/qn/glm_base.cuh index d1e6ef37c6..a63f4ce0a2 100644 --- a/cpp/src/glm/qn/glm_base.cuh +++ b/cpp/src/glm/qn/glm_base.cuh @@ -192,7 +192,7 @@ struct GLMBase : GLMDims { cudaStream_t stream, bool initGradZero = true) { - Loss* loss = static_cast(this); // static polymorphism + Loss* loss = static_cast(this); // static polymorphism linearFwd(handle, Zb, Xb, W); // linear part: forward pass loss->getLossAndDZ(loss_val, Zb, yb, stream); // loss specific part diff --git a/cpp/src/glm/qn/qn_util.cuh b/cpp/src/glm/qn/qn_util.cuh index 55f594e17c..2af4522915 100644 --- a/cpp/src/glm/qn/qn_util.cuh +++ b/cpp/src/glm/qn/qn_util.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,19 +53,19 @@ enum OPT_RETCODE { template class LBFGSParam { public: - int m; // lbfgs memory limit - T epsilon; // controls convergence - int past; // lookback for function value based convergence test - T delta; // controls fun val based conv test + int m; // lbfgs memory limit + T epsilon; // controls convergence + int past; // lookback for function value based convergence test + T delta; // controls fun val based conv test int max_iterations; int linesearch; // see enum above int max_linesearch; - T min_step; // min. allowed step length - T max_step; // max. allowed step length - T ftol; // line search tolerance - T wolfe; // wolfe parameter - T ls_dec; // line search decrease factor - T ls_inc; // line search increase factor + T min_step; // min. allowed step length + T max_step; // max. 
allowed step length + T ftol; // line search tolerance + T wolfe; // wolfe parameter + T ls_dec; // line search decrease factor + T ls_inc; // line search increase factor public: LBFGSParam() diff --git a/cpp/src/hdbscan/detail/soft_clustering.cuh b/cpp/src/hdbscan/detail/soft_clustering.cuh index 10085d0e49..2268aa9511 100644 --- a/cpp/src/hdbscan/detail/soft_clustering.cuh +++ b/cpp/src/hdbscan/detail/soft_clustering.cuh @@ -16,10 +16,10 @@ #pragma once -#include #include "kernels/soft_clustering.cuh" #include "select.cuh" #include "utils.h" +#include #include @@ -54,7 +54,8 @@ namespace HDBSCAN { namespace detail { namespace Predict { -// Computing distance based membership for points in the original clustering on which the clusterer was trained and new points outside of the training data. +// Computing distance based membership for points in the original clustering on which the clusterer +// was trained and new points outside of the training data. template void dist_membership_vector(const raft::handle_t& handle, const value_t* X, @@ -84,79 +85,98 @@ void dist_membership_vector(const raft::handle_t& handle, n_batches = raft::ceildiv((int)n_queries, (int)batch_size); - for(value_idx bid = 0; bid < n_batches; bid++) { - value_idx batch_offset = bid * batch_size; + for (value_idx bid = 0; bid < n_batches; bid++) { + value_idx batch_offset = bid * batch_size; value_idx samples_per_batch = min((value_idx)batch_size, (value_idx)n_queries - batch_offset); rmm::device_uvector dist(samples_per_batch * n_exemplars, stream); - // compute the distances using raft API + // compute the distances using raft API switch (metric) { case raft::distance::DistanceType::L2SqrtExpanded: raft::distance:: distance( - handle, query + batch_offset * n, exemplars_dense.data(), dist.data(), samples_per_batch, n_exemplars, n, true); - break; - case raft::distance::DistanceType::L1: - raft::distance::distance( - handle, query + batch_offset * n, exemplars_dense.data(), dist.data(), samples_per_batch, n_exemplars, n, true); - break; - case raft::distance::DistanceType::CosineExpanded: - raft::distance:: - distance( - handle, query + batch_offset * n, exemplars_dense.data(), dist.data(), samples_per_batch, n_exemplars, n, true); - break; - default: ASSERT(false, "Incorrect metric passed!"); - } + handle, + query + batch_offset * n, + exemplars_dense.data(), + dist.data(), + samples_per_batch, + n_exemplars, + n, + true); + break; + case raft::distance::DistanceType::L1: + raft::distance::distance( + handle, + query + batch_offset * n, + exemplars_dense.data(), + dist.data(), + samples_per_batch, + n_exemplars, + n, + true); + break; + case raft::distance::DistanceType::CosineExpanded: + raft::distance:: + distance( + handle, + query + batch_offset * n, + exemplars_dense.data(), + dist.data(), + samples_per_batch, + n_exemplars, + n, + true); + break; + default: ASSERT(false, "Incorrect metric passed!"); + } - // compute the minimum distances to exemplars of each cluster - value_idx n_elements = samples_per_batch * n_selected_clusters; - auto min_dist = raft::make_device_vector(handle, n_elements); + // compute the minimum distances to exemplars of each cluster + value_idx n_elements = samples_per_batch * n_selected_clusters; + auto min_dist = raft::make_device_vector(handle, n_elements); - auto reduction_op = [dist = dist.data(), - batch_offset, - divisor = raft::util::FastIntDiv(n_selected_clusters), - n_selected_clusters, - n_exemplars, - exemplar_label_offsets] __device__(auto idx) { - auto col = idx % divisor; - 
auto row = idx / divisor; - auto start = exemplar_label_offsets[col]; - auto end = exemplar_label_offsets[col + 1]; - - value_t min_val = std::numeric_limits::max(); - for (value_idx i = start; i < end; i++) { - if (dist[row * n_exemplars + i] < min_val) { - min_val = dist[row * n_exemplars + i]; + auto reduction_op = [dist = dist.data(), + batch_offset, + divisor = raft::util::FastIntDiv(n_selected_clusters), + n_selected_clusters, + n_exemplars, + exemplar_label_offsets] __device__(auto idx) { + auto col = idx % divisor; + auto row = idx / divisor; + auto start = exemplar_label_offsets[col]; + auto end = exemplar_label_offsets[col + 1]; + + value_t min_val = std::numeric_limits::max(); + for (value_idx i = start; i < end; i++) { + if (dist[row * n_exemplars + i] < min_val) { min_val = dist[row * n_exemplars + i]; } } - } - return min_val; - }; - - raft::linalg::map_offset(handle, min_dist.view(), reduction_op); + return min_val; + }; - // Softmax computation is ignored in distance membership - if (softmax) { - thrust::transform(exec_policy, - min_dist.data_handle(), - min_dist.data_handle() + samples_per_batch * n_selected_clusters, - dist_membership_vec + batch_offset * n_selected_clusters, - [=] __device__(value_t val) { - if (val != 0) { return value_t(exp(1.0 / val)); } - return std::numeric_limits::max(); - }); - } + raft::linalg::map_offset(handle, min_dist.view(), reduction_op); + + // Softmax computation is ignored in distance membership + if (softmax) { + thrust::transform(exec_policy, + min_dist.data_handle(), + min_dist.data_handle() + samples_per_batch * n_selected_clusters, + dist_membership_vec + batch_offset * n_selected_clusters, + [=] __device__(value_t val) { + if (val != 0) { return value_t(exp(1.0 / val)); } + return std::numeric_limits::max(); + }); + } - // Transform the distances to obtain membership based on proximity to exemplars - else { - thrust::transform(exec_policy, - min_dist.data_handle(), - min_dist.data_handle() + samples_per_batch * n_selected_clusters, - dist_membership_vec + batch_offset * n_selected_clusters, - [=] __device__(value_t val) { - if (val > 0) { return value_t(1.0 / val); } - return std::numeric_limits::max() / n_selected_clusters; - }); - } + // Transform the distances to obtain membership based on proximity to exemplars + else { + thrust::transform(exec_policy, + min_dist.data_handle(), + min_dist.data_handle() + samples_per_batch * n_selected_clusters, + dist_membership_vec + batch_offset * n_selected_clusters, + [=] __device__(value_t val) { + if (val > 0) { return value_t(1.0 / val); } + return std::numeric_limits::max() / n_selected_clusters; + }); + } } // Normalize the obtained result to sum to 1.0 Utils::normalize(dist_membership_vec, n_selected_clusters, n_queries, stream); @@ -198,13 +218,10 @@ void all_points_outlier_membership_vector( auto leaf_max_lambdas = raft::make_device_vector(handle, n_leaves); raft::linalg::map_offset(handle, - leaf_max_lambdas.view(), - [deaths, - parents, - index_into_children, - n_leaves] __device__(auto idx) { - return deaths[parents[index_into_children[idx]] - n_leaves]; - }); + leaf_max_lambdas.view(), + [deaths, parents, index_into_children, n_leaves] __device__(auto idx) { + return deaths[parents[index_into_children[idx]] - n_leaves]; + }); raft::linalg::matrixVectorOp( outlier_membership_vec, @@ -245,19 +262,29 @@ void all_points_prob_in_some_cluster(const raft::handle_t& handle, auto height_argmax = raft::make_device_vector(handle, m); - auto merge_heights_view = 
raft::make_device_matrix_view(merge_heights, (int)m, n_selected_clusters); - - raft::matrix::argmax( - handle, merge_heights_view, height_argmax.view()); - - auto prob_in_some_cluster_op = [deaths, lambdas, index_into_children, selected_clusters, n_leaves, merge_heights, height_argmax = height_argmax.data_handle(), n_selected_clusters]__device__(auto idx) { - value_idx nearest_cluster = height_argmax[idx]; - value_t max_lambda = max(lambdas[index_into_children[idx]], - deaths[selected_clusters[nearest_cluster] - n_leaves]); + auto merge_heights_view = + raft::make_device_matrix_view( + merge_heights, (int)m, n_selected_clusters); + + raft::matrix::argmax(handle, merge_heights_view, height_argmax.view()); + + auto prob_in_some_cluster_op = [deaths, + lambdas, + index_into_children, + selected_clusters, + n_leaves, + merge_heights, + height_argmax = height_argmax.data_handle(), + n_selected_clusters] __device__(auto idx) { + value_idx nearest_cluster = height_argmax[idx]; + value_t max_lambda = + max(lambdas[index_into_children[idx]], deaths[selected_clusters[nearest_cluster] - n_leaves]); return merge_heights[idx * n_selected_clusters + nearest_cluster] / max_lambda; - }; - raft::linalg::map_offset(handle, raft::make_device_vector_view(prob_in_some_cluster, m), prob_in_some_cluster_op); - + }; + raft::linalg::map_offset( + handle, + raft::make_device_vector_view(prob_in_some_cluster, m), + prob_in_some_cluster_op); } template @@ -300,14 +327,14 @@ void outlier_membership_vector(const raft::handle_t& handle, // fetch the max lambda of the cluster to which the nearest MR neighbor belongs in the condensed // hierarchy - auto nearest_cluster_max_lambda = raft::make_device_vector(handle, n_prediction_points); - raft::linalg::map_offset(handle, nearest_cluster_max_lambda.view(), [deaths, - parents, - index_into_children, - min_mr_inds, - n_leaves] __device__(auto idx) { - return deaths[parents[index_into_children[min_mr_inds[idx]]] - n_leaves]; - }); + auto nearest_cluster_max_lambda = + raft::make_device_vector(handle, n_prediction_points); + raft::linalg::map_offset( + handle, + nearest_cluster_max_lambda.view(), + [deaths, parents, index_into_children, min_mr_inds, n_leaves] __device__(auto idx) { + return deaths[parents[index_into_children[min_mr_inds[idx]]] - n_leaves]; + }); raft::linalg::matrixVectorOp( outlier_membership_vec, @@ -351,19 +378,30 @@ void prob_in_some_cluster(const raft::handle_t& handle, auto n_edges = condensed_tree.get_n_edges(); auto children = condensed_tree.get_children(); - auto height_argmax = raft::make_device_vector (handle, n_prediction_points); + auto height_argmax = raft::make_device_vector(handle, n_prediction_points); + + auto merge_heights_view = + raft::make_device_matrix_view( + merge_heights, (int)n_prediction_points, n_selected_clusters); - auto merge_heights_view = raft::make_device_matrix_view(merge_heights, (int)n_prediction_points, n_selected_clusters); + raft::matrix::argmax(handle, merge_heights_view, height_argmax.view()); - raft::matrix::argmax( - handle, merge_heights_view, height_argmax.view()); - - auto prob_in_some_cluster_op = [prediction_lambdas, deaths, selected_clusters, n_leaves, merge_heights, height_argmax = height_argmax.data_handle(), n_selected_clusters]__device__(auto idx) { - value_idx nearest_cluster = height_argmax[idx]; - value_t max_lambda = max(prediction_lambdas[idx], deaths[selected_clusters[nearest_cluster] - n_leaves]) + 1e-8; + auto prob_in_some_cluster_op = [prediction_lambdas, + deaths, + selected_clusters, + n_leaves, 
+ merge_heights, + height_argmax = height_argmax.data_handle(), + n_selected_clusters] __device__(auto idx) { + value_idx nearest_cluster = height_argmax[idx]; + value_t max_lambda = + max(prediction_lambdas[idx], deaths[selected_clusters[nearest_cluster] - n_leaves]) + 1e-8; return merge_heights[idx * n_selected_clusters + nearest_cluster] / max_lambda; - }; - raft::linalg::map_offset(handle, raft::make_device_vector_view(prob_in_some_cluster, n_prediction_points), prob_in_some_cluster_op); + }; + raft::linalg::map_offset( + handle, + raft::make_device_vector_view(prob_in_some_cluster, n_prediction_points), + prob_in_some_cluster_op); } /** @@ -396,7 +434,9 @@ void all_points_membership_vectors(const raft::handle_t& handle, size_t n = prediction_data.n_cols; if (batch_size > m) batch_size = m; - RAFT_EXPECTS(0 < batch_size && batch_size <= m, "Invalid batch_size. batch_size should be > 0 and <= the number of samples in the training data"); + RAFT_EXPECTS(0 < batch_size && batch_size <= m, + "Invalid batch_size. batch_size should be > 0 and <= the number of samples in the " + "training data"); auto parents = condensed_tree.get_parents(); auto children = condensed_tree.get_children(); @@ -479,7 +519,7 @@ void all_points_membership_vectors(const raft::handle_t& handle, } /** - * Predict soft cluster membership vectors for new points (not in the training data). + * Predict soft cluster membership vectors for new points (not in the training data). * * @tparam value_idx * @tparam value_t @@ -491,8 +531,9 @@ void all_points_membership_vectors(const raft::handle_t& handle, * @param[in] n_prediction_points number of prediction points * @param[in] metric distance metric * @param[in] min_samples neighborhood size during training (includes self-loop) - * @param[out] membership_vec output membership vectors (size n_prediction_points * n_selected_clusters) - * @param[in] batch_size batch size to be used while computing distance based memberships + * @param[out] membership_vec output membership vectors (size n_prediction_points * + * n_selected_clusters) + * @param[in] batch_size batch size to be used while computing distance based memberships */ template void membership_vector(const raft::handle_t& handle, @@ -522,7 +563,9 @@ void membership_vector(const raft::handle_t& handle, value_t* lambdas = condensed_tree.get_lambdas(); if (batch_size > n_prediction_points) batch_size = n_prediction_points; - RAFT_EXPECTS(0 < batch_size && batch_size <= n_prediction_points, "Invalid batch_size. batch_size should be > 0 and <= the number of samples in the training data"); + RAFT_EXPECTS(0 < batch_size && batch_size <= n_prediction_points, + "Invalid batch_size. 
batch_size should be > 0 and <= the number of samples in the " + "training data"); rmm::device_uvector dist_membership_vec(n_prediction_points * n_selected_clusters, stream); @@ -540,7 +583,8 @@ void membership_vector(const raft::handle_t& handle, raft::distance::DistanceType::L2SqrtExpanded, batch_size); - auto prediction_lambdas = raft::make_device_vector(handle, n_prediction_points); + auto prediction_lambdas = + raft::make_device_vector(handle, n_prediction_points); rmm::device_uvector min_mr_inds(n_prediction_points, stream); _compute_knn_and_nearest_neighbor(handle, @@ -553,15 +597,16 @@ void membership_vector(const raft::handle_t& handle, prediction_lambdas.data_handle(), metric); - raft::linalg::map_offset(handle, - prediction_lambdas.view(), - [lambdas, - index_into_children, - min_mr_inds = min_mr_inds.data(), - prediction_lambdas = prediction_lambdas.data_handle()] __device__(auto idx) { - value_t neighbor_lambda = lambdas[index_into_children[min_mr_inds[idx]]]; - return min(prediction_lambdas[idx], neighbor_lambda); - }); + raft::linalg::map_offset( + handle, + prediction_lambdas.view(), + [lambdas, + index_into_children, + min_mr_inds = min_mr_inds.data(), + prediction_lambdas = prediction_lambdas.data_handle()] __device__(auto idx) { + value_t neighbor_lambda = lambdas[index_into_children[min_mr_inds[idx]]]; + return min(prediction_lambdas[idx], neighbor_lambda); + }); rmm::device_uvector merge_heights(n_prediction_points * n_selected_clusters, stream); @@ -583,8 +628,10 @@ void membership_vector(const raft::handle_t& handle, return pow(membership_vec[idx], 2) * pow(dist_membership_vec[idx], 0.5); }; - raft::linalg::map_offset( - handle, raft::make_device_vector_view(membership_vec, n_prediction_points * n_selected_clusters), combine_op); + raft::linalg::map_offset(handle, + raft::make_device_vector_view( + membership_vec, n_prediction_points * n_selected_clusters), + combine_op); // Normalize to obtain probabilities conditioned on points belonging to some cluster Utils::normalize(membership_vec, n_selected_clusters, n_prediction_points, stream); diff --git a/cpp/src/hdbscan/detail/utils.h b/cpp/src/hdbscan/detail/utils.h index 678ea0f0df..23171e540a 100644 --- a/cpp/src/hdbscan/detail/utils.h +++ b/cpp/src/hdbscan/detail/utils.h @@ -216,7 +216,8 @@ void normalize(value_t* data, value_idx n, size_t m, cudaStream_t stream) } /** - * Computes softmax (unnormalized). The input is modified in-place. For numerical stability, the maximum value of a row is subtracted from the exponent. + * Computes softmax (unnormalized). The input is modified in-place. For numerical stability, the + * maximum value of a row is subtracted from the exponent. 
* @tparam value_idx * @tparam value_t * @param[in] handle raft handle for resource reuse @@ -229,16 +230,27 @@ void softmax(const raft::handle_t& handle, value_t* data, value_idx n, size_t m) { rmm::device_uvector linf_norm(m, handle.get_stream()); - auto data_const_view = raft::make_device_matrix_view(data, (int)m, n); - auto data_view = raft::make_device_matrix_view(data, (int)m, n); - auto linf_norm_const_view = raft::make_device_vector_view(linf_norm.data(), (int)m); + auto data_const_view = + raft::make_device_matrix_view(data, (int)m, n); + auto data_view = + raft::make_device_matrix_view(data, (int)m, n); + auto linf_norm_const_view = + raft::make_device_vector_view(linf_norm.data(), (int)m); auto linf_norm_view = raft::make_device_vector_view(linf_norm.data(), (int)m); - raft::linalg::norm(handle, data_const_view, linf_norm_view, raft::linalg::LinfNorm, raft::linalg::Apply::ALONG_ROWS); - - raft::linalg::matrix_vector_op(handle, data_const_view, linf_norm_const_view, data_view, raft::linalg::Apply::ALONG_COLUMNS, [] __device__(value_t mat_in, value_t vec_in) { - return exp(mat_in - vec_in); - }); + raft::linalg::norm(handle, + data_const_view, + linf_norm_view, + raft::linalg::LinfNorm, + raft::linalg::Apply::ALONG_ROWS); + + raft::linalg::matrix_vector_op( + handle, + data_const_view, + linf_norm_const_view, + data_view, + raft::linalg::Apply::ALONG_COLUMNS, + [] __device__(value_t mat_in, value_t vec_in) { return exp(mat_in - vec_in); }); } }; // namespace Utils diff --git a/cpp/src/hierarchy/linkage.cu b/cpp/src/hierarchy/linkage.cu index 8deaf4e31b..a7f92b5968 100644 --- a/cpp/src/hierarchy/linkage.cu +++ b/cpp/src/hierarchy/linkage.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -49,11 +49,9 @@ void single_linkage_neighbors(const raft::handle_t& handle, struct distance_graph_impl_int_float : public raft::cluster::detail:: - distance_graph_impl { -}; + distance_graph_impl {}; struct distance_graph_impl_int_double : public raft::cluster::detail:: - distance_graph_impl { -}; + distance_graph_impl {}; }; // end namespace ML diff --git a/cpp/src/knn/knn_opg_common.cuh b/cpp/src/knn/knn_opg_common.cuh index 14191c5b02..dfc2cffe18 100644 --- a/cpp/src/knn/knn_opg_common.cuh +++ b/cpp/src/knn/knn_opg_common.cuh @@ -97,9 +97,9 @@ struct opg_knn_param { size_t batch_size = 0; /**< Batch size */ bool verbose; /**< verbose */ - std::size_t n_outputs = 0; /**< Number of outputs per query (cl&re) */ - std::vector>* y; /**< Labels input array (cl&re) */ - std::vector*>* out; /**< KNN outputs output array (cl&re) */ + std::size_t n_outputs = 0; /**< Number of outputs per query (cl&re) */ + std::vector>* y; /**< Labels input array (cl&re) */ + std::vector*>* out; /**< KNN outputs output array (cl&re) */ std::vector* n_unique = nullptr; /**< Number of unique labels (classification) */ std::vector* uniq_labels = nullptr; /**< Unique labels (classification) */ diff --git a/cpp/src/solver/sgd.cuh b/cpp/src/solver/sgd.cuh index ccadaae47f..c1494c0b75 100644 --- a/cpp/src/solver/sgd.cuh +++ b/cpp/src/solver/sgd.cuh @@ -70,8 +70,8 @@ using namespace MLCommon; * @param lr_type * type of the learning rate function (i.e. OPTIMAL, CONSTANT, INVSCALING, ADAPTIVE) * @param eta0 - * learning rate for constant lr_type. 
It's used to calculate learning rate function for other - * types of lr_type + * learning rate for constant lr_type. It's used to calculate learning rate function for + * other types of lr_type * @param power_t * power value in the INVSCALING lr_type * @param loss diff --git a/cpp/src/svm/kernelcache.cuh b/cpp/src/svm/kernelcache.cuh index 0ed1a0ff9d..13853112f5 100644 --- a/cpp/src/svm/kernelcache.cuh +++ b/cpp/src/svm/kernelcache.cuh @@ -320,10 +320,10 @@ class KernelCache { rmm::device_uvector tile; //!< Kernel matrix tile - int n_rows; //!< number of rows in x - int n_cols; //!< number of columns in x - int n_ws; //!< number of elements in the working set - int n_unique; //!< number of unique x vectors in the working set + int n_rows; //!< number of rows in x + int n_cols; //!< number of columns in x + int n_ws; //!< number of elements in the working set + int n_unique; //!< number of unique x vectors in the working set cublasHandle_t cublas_handle; diff --git a/cpp/src/svm/results.cuh b/cpp/src/svm/results.cuh index c03ccef9f5..f4e502cc17 100644 --- a/cpp/src/svm/results.cuh +++ b/cpp/src/svm/results.cuh @@ -283,13 +283,13 @@ class Results { const raft::handle_t& handle; cudaStream_t stream; - int n_rows; //!< number of rows in the training vector matrix - int n_cols; //!< number of features - const math_t* x; //!< training vectors - const math_t* y; //!< labels - const math_t* C; //!< penalty parameter - SvmType svmType; //!< SVM problem type: SVC or SVR - int n_train; //!< number of training vectors (including duplicates for SVR) + int n_rows; //!< number of rows in the training vector matrix + int n_cols; //!< number of features + const math_t* x; //!< training vectors + const math_t* y; //!< labels + const math_t* C; //!< penalty parameter + SvmType svmType; //!< SVM problem type: SVC or SVR + int n_train; //!< number of training vectors (including duplicates for SVR) const int TPB = 256; // threads per block // Temporary variables used by cub in GetResults diff --git a/cpp/src/svm/smosolver.cuh b/cpp/src/svm/smosolver.cuh index adf5cd7dd6..fe264cce87 100644 --- a/cpp/src/svm/smosolver.cuh +++ b/cpp/src/svm/smosolver.cuh @@ -397,7 +397,7 @@ class SmoSolver { rmm::device_uvector f; //!< optimality indicator vector rmm::device_uvector y_label; //!< extra label for regression - rmm::device_uvector C_vec; //!< penalty parameter vector + rmm::device_uvector C_vec; //!< penalty parameter vector // Buffers for the working set [n_ws] //! 
change in alpha parameter during a blocksolve step @@ -415,7 +415,7 @@ class SmoSolver { raft::distance::kernels::GramMatrixBase* kernel; float cache_size; //!< size of kernel cache in MiB - SvmType svmType; ///!< Type of the SVM problem to solve + SvmType svmType; ///!< Type of the SVM problem to solve // Variables to track convergence of training math_t diff_prev; diff --git a/cpp/src/tsne/barnes_hut_kernels.cuh b/cpp/src/tsne/barnes_hut_kernels.cuh index 5059c1a8f1..67e3e46a7e 100644 --- a/cpp/src/tsne/barnes_hut_kernels.cuh +++ b/cpp/src/tsne/barnes_hut_kernels.cuh @@ -542,7 +542,7 @@ __global__ __launch_bounds__(THREADS4, FACTOR4) void SortKernel(value_idx* restr */ template __global__ __launch_bounds__( - THREADS5, 1) void RepulsionKernel(/* int *restrict errd, */ + THREADS5, 1) void RepulsionKernel( /* int *restrict errd, */ const float theta, const float epssqd, // correction for zero distance const value_idx* restrict sortd, diff --git a/cpp/src/tsne/cannylab/bh.cu b/cpp/src/tsne/cannylab/bh.cu index d280ae6f76..59ac35ae33 100644 --- a/cpp/src/tsne/cannylab/bh.cu +++ b/cpp/src/tsne/cannylab/bh.cu @@ -38,17 +38,16 @@ Tree-based Barnes Hut n-Body Algorithm. Chapter 6 in GPU Computing Gems Emerald Edition, pp. 75-92. January 2011. */ - -#include -#include +#include #include +#include +#include #include -#include // threads per block -#define THREADS1 1024 /* must be a power of 2 */ +#define THREADS1 1024 /* must be a power of 2 */ #define THREADS2 1024 -#define THREADS3 768 /* shared-memory limited on some devices */ +#define THREADS3 768 /* shared-memory limited on some devices */ #define THREADS4 1024 #define THREADS5 1024 #define THREADS6 1024 @@ -56,8 +55,8 @@ Emerald Edition, pp. 75-92. January 2011. // block count = factor * #SMs #define FACTOR1 2 #define FACTOR2 2 -#define FACTOR3 1 /* must all be resident at the same time */ -#define FACTOR4 1 /* must all be resident at the same time */ +#define FACTOR3 1 /* must all be resident at the same time */ +#define FACTOR4 1 /* must all be resident at the same time */ #define FACTOR5 2 #define FACTOR6 2 @@ -68,29 +67,33 @@ __device__ volatile int stepd, bottomd; __device__ unsigned int blkcntd; __device__ volatile float radiusd; - /******************************************************************************/ /*** initialize memory ********************************************************/ /******************************************************************************/ __global__ void InitializationKernel() { - stepd = -1; + stepd = -1; blkcntd = 0; } - /******************************************************************************/ /*** compute center and radius ************************************************/ /******************************************************************************/ -__global__ -__launch_bounds__(THREADS1, FACTOR1) -void BoundingBoxKernel(const int nnodesd, const int nbodiesd, int* const __restrict__ startd, int* const __restrict__ childd, float4* const __restrict__ posMassd, float3* const __restrict__ maxd, float3* const __restrict__ mind) +__global__ __launch_bounds__(THREADS1, + FACTOR1) void BoundingBoxKernel(const int nnodesd, + const int nbodiesd, + int* const __restrict__ startd, + int* const __restrict__ childd, + float4* const __restrict__ posMassd, + float3* const __restrict__ maxd, + float3* const __restrict__ mind) { int i, j, k, inc; float val; - __shared__ volatile float sminx[THREADS1], smaxx[THREADS1], sminy[THREADS1], smaxy[THREADS1], sminz[THREADS1], smaxz[THREADS1]; + __shared__ volatile 
float sminx[THREADS1], smaxx[THREADS1], sminy[THREADS1], smaxy[THREADS1], + sminz[THREADS1], smaxz[THREADS1]; float3 min, max; // initialize with valid data (in case #bodies < #threads) @@ -100,19 +103,19 @@ void BoundingBoxKernel(const int nnodesd, const int nbodiesd, int* const __restr min.z = max.z = p0.z; // scan all bodies - i = threadIdx.x; + i = threadIdx.x; inc = THREADS1 * gridDim.x; for (j = i + blockIdx.x * THREADS1; j < nbodiesd; j += inc) { const float4 p = posMassd[j]; - val = p.x; - min.x = fminf(min.x, val); - max.x = fmaxf(max.x, val); - val = p.y; - min.y = fminf(min.y, val); - max.y = fmaxf(max.y, val); - val = p.z; - min.z = fminf(min.z, val); - max.z = fmaxf(max.z, val); + val = p.x; + min.x = fminf(min.x, val); + max.x = fmaxf(max.x, val); + val = p.y; + min.y = fminf(min.y, val); + max.y = fmaxf(max.y, val); + val = p.z; + min.z = fminf(min.z, val); + max.z = fmaxf(max.z, val); } // reduction in shared memory @@ -126,7 +129,7 @@ void BoundingBoxKernel(const int nnodesd, const int nbodiesd, int* const __restr for (j = THREADS1 / 2; j > 0; j /= 2) { __syncthreads(); if (i < j) { - k = i + j; + k = i + j; sminx[i] = min.x = fminf(min.x, sminx[k]); smaxx[i] = max.x = fmaxf(max.x, smaxx[k]); sminy[i] = min.y = fminf(min.y, sminy[k]); @@ -138,7 +141,7 @@ void BoundingBoxKernel(const int nnodesd, const int nbodiesd, int* const __restr // write block result to global memory if (i == 0) { - k = blockIdx.x; + k = blockIdx.x; mind[k] = min; maxd[k] = max; __threadfence(); @@ -149,52 +152,52 @@ void BoundingBoxKernel(const int nnodesd, const int nbodiesd, int* const __restr for (j = 0; j <= inc; j++) { float3 minp = mind[j]; float3 maxp = maxd[j]; - min.x = fminf(min.x, minp.x); - max.x = fmaxf(max.x, maxp.x); - min.y = fminf(min.y, minp.y); - max.y = fmaxf(max.y, maxp.y); - min.z = fminf(min.z, minp.z); - max.z = fmaxf(max.z, maxp.z); + min.x = fminf(min.x, minp.x); + max.x = fmaxf(max.x, maxp.x); + min.y = fminf(min.y, minp.y); + max.y = fmaxf(max.y, maxp.y); + min.z = fminf(min.z, minp.z); + max.z = fmaxf(max.z, maxp.z); } // compute radius - val = fmaxf(max.x - min.x, max.y - min.y); + val = fmaxf(max.x - min.x, max.y - min.y); radiusd = fmaxf(val, max.z - min.z) * 0.5f; // create root node - k = nnodesd; + k = nnodesd; bottomd = k; startd[k] = 0; float4 p; - p.x = (min.x + max.x) * 0.5f; - p.y = (min.y + max.y) * 0.5f; - p.z = (min.z + max.z) * 0.5f; - p.w = -1.0f; + p.x = (min.x + max.x) * 0.5f; + p.y = (min.y + max.y) * 0.5f; + p.z = (min.z + max.z) * 0.5f; + p.w = -1.0f; posMassd[k] = p; k *= 8; - for (i = 0; i < 8; i++) childd[k + i] = -1; + for (i = 0; i < 8; i++) + childd[k + i] = -1; stepd++; } } } - /******************************************************************************/ /*** build tree ***************************************************************/ /******************************************************************************/ -__global__ -__launch_bounds__(1024, 1) -void ClearKernel1(const int nnodesd, const int nbodiesd, int* const __restrict__ childd) +__global__ __launch_bounds__(1024, 1) void ClearKernel1(const int nnodesd, + const int nbodiesd, + int* const __restrict__ childd) { int k, inc, top, bottom; - top = 8 * nnodesd; + top = 8 * nnodesd; bottom = 8 * nbodiesd; - inc = blockDim.x * gridDim.x; - k = (bottom & (-WARPSIZE)) + threadIdx.x + blockIdx.x * blockDim.x; // align to warp size + inc = blockDim.x * gridDim.x; + k = (bottom & (-WARPSIZE)) + threadIdx.x + blockIdx.x * blockDim.x; // align to warp size if (k < bottom) k += inc; // iterate 
over all cells assigned to thread @@ -204,10 +207,11 @@ void ClearKernel1(const int nnodesd, const int nbodiesd, int* const __restrict__ } } - -__global__ -__launch_bounds__(THREADS2, FACTOR2) -void TreeBuildingKernel(const int nnodesd, const int nbodiesd, volatile int* const __restrict__ childd, const float4* const __restrict__ posMassd) +__global__ __launch_bounds__(THREADS2, FACTOR2) void TreeBuildingKernel( + const int nnodesd, + const int nbodiesd, + volatile int* const __restrict__ childd, + const float4* const __restrict__ posMassd) { int i, j, depth, skip, inc; float x, y, z, r; @@ -216,96 +220,127 @@ void TreeBuildingKernel(const int nnodesd, const int nbodiesd, volatile int* con float radius; // cache root data - radius = radiusd * 0.5f; + radius = radiusd * 0.5f; const float4 root = posMassd[nnodesd]; skip = 1; - inc = blockDim.x * gridDim.x; - i = threadIdx.x + blockIdx.x * blockDim.x; + inc = blockDim.x * gridDim.x; + i = threadIdx.x + blockIdx.x * blockDim.x; // iterate over all bodies assigned to thread while (i < nbodiesd) { const float4 p = posMassd[i]; if (skip != 0) { // new body, so start traversing at root - skip = 0; - n = nnodesd; + skip = 0; + n = nnodesd; depth = 1; - r = radius; + r = radius; dx = dy = dz = -r; - j = 0; + j = 0; // determine which child to follow - if (root.x < p.x) {j = 1; dx = r;} - if (root.y < p.y) {j |= 2; dy = r;} - if (root.z < p.z) {j |= 4; dz = r;} + if (root.x < p.x) { + j = 1; + dx = r; + } + if (root.y < p.y) { + j |= 2; + dy = r; + } + if (root.z < p.z) { + j |= 4; + dz = r; + } x = root.x + dx; y = root.y + dy; z = root.z + dz; } // follow path to leaf cell - ch = childd[n*8+j]; + ch = childd[n * 8 + j]; while (ch >= nbodiesd) { n = ch; depth++; r *= 0.5f; dx = dy = dz = -r; - j = 0; + j = 0; // determine which child to follow - if (x < p.x) {j = 1; dx = r;} - if (y < p.y) {j |= 2; dy = r;} - if (z < p.z) {j |= 4; dz = r;} + if (x < p.x) { + j = 1; + dx = r; + } + if (y < p.y) { + j |= 2; + dy = r; + } + if (z < p.z) { + j |= 4; + dz = r; + } x += dx; y += dy; z += dz; - ch = childd[n*8+j]; + ch = childd[n * 8 + j]; } if (ch != -2) { // skip if child pointer is locked and try again later - locked = n*8+j; + locked = n * 8 + j; if (ch == -1) { if (-1 == atomicCAS((int*)&childd[locked], -1, i)) { // if null, just insert the new body - i += inc; // move on to next body + i += inc; // move on to next body skip = 1; } } else { // there already is a body at this position if (ch == atomicCAS((int*)&childd[locked], ch, -2)) { // try to lock - patch = -1; + patch = -1; const float4 chp = posMassd[ch]; // create new cell(s) and insert the old and new bodies do { depth++; - if (depth > MAXDEPTH) {printf("ERROR: maximum depth exceeded (bodies are too close together)\n"); asm("trap;");} + if (depth > MAXDEPTH) { + printf("ERROR: maximum depth exceeded (bodies are too close together)\n"); + asm("trap;"); + } cell = atomicSub((int*)&bottomd, 1) - 1; - if (cell <= nbodiesd) {printf("ERROR: out of cell memory\n"); asm("trap;");} - - if (patch != -1) { - childd[n*8+j] = cell; + if (cell <= nbodiesd) { + printf("ERROR: out of cell memory\n"); + asm("trap;"); } + + if (patch != -1) { childd[n * 8 + j] = cell; } patch = max(patch, cell); j = 0; if (x < chp.x) j = 1; if (y < chp.y) j |= 2; if (z < chp.z) j |= 4; - childd[cell*8+j] = ch; + childd[cell * 8 + j] = ch; n = cell; r *= 0.5f; dx = dy = dz = -r; - j = 0; - if (x < p.x) {j = 1; dx = r;} - if (y < p.y) {j |= 2; dy = r;} - if (z < p.z) {j |= 4; dz = r;} + j = 0; + if (x < p.x) { + j = 1; + dx = r; + 
} + if (y < p.y) { + j |= 2; + dy = r; + } + if (z < p.z) { + j |= 4; + dz = r; + } x += dx; y += dy; z += dz; - ch = childd[n*8+j]; + ch = childd[n * 8 + j]; // repeat until the two bodies are different children } while (ch >= 0); - childd[n*8+j] = i; + childd[n * 8 + j] = i; i += inc; // move on to next body skip = 2; @@ -315,40 +350,39 @@ void TreeBuildingKernel(const int nnodesd, const int nbodiesd, volatile int* con __syncthreads(); // optional barrier for performance __threadfence(); - if (skip == 2) { - childd[locked] = patch; - } + if (skip == 2) { childd[locked] = patch; } } } - -__global__ -__launch_bounds__(1024, 1) -void ClearKernel2(const int nnodesd, int* const __restrict__ startd, float4* const __restrict__ posMassd) +__global__ __launch_bounds__(1024, 1) void ClearKernel2(const int nnodesd, + int* const __restrict__ startd, + float4* const __restrict__ posMassd) { int k, inc, bottom; bottom = bottomd; - inc = blockDim.x * gridDim.x; - k = (bottom & (-WARPSIZE)) + threadIdx.x + blockIdx.x * blockDim.x; // align to warp size + inc = blockDim.x * gridDim.x; + k = (bottom & (-WARPSIZE)) + threadIdx.x + blockIdx.x * blockDim.x; // align to warp size if (k < bottom) k += inc; // iterate over all cells assigned to thread while (k < nnodesd) { posMassd[k].w = -1.0f; - startd[k] = -1; + startd[k] = -1; k += inc; } } - /******************************************************************************/ /*** compute center of mass ***************************************************/ /******************************************************************************/ -__global__ -__launch_bounds__(THREADS3, FACTOR3) -void SummarizationKernel(const int nnodesd, const int nbodiesd, volatile int* const __restrict__ countd, const int* const __restrict__ childd, volatile float4* const __restrict__ posMassd) +__global__ __launch_bounds__(THREADS3, FACTOR3) void SummarizationKernel( + const int nnodesd, + const int nbodiesd, + volatile int* const __restrict__ countd, + const int* const __restrict__ childd, + volatile float4* const __restrict__ posMassd) { int i, j, k, ch, inc, cnt, bottom; float m, cm, px, py, pz; @@ -356,8 +390,8 @@ void SummarizationKernel(const int nnodesd, const int nbodiesd, volatile int* co __shared__ float mass[THREADS3 * 8]; bottom = bottomd; - inc = blockDim.x * gridDim.x; - k = (bottom & (-WARPSIZE)) + threadIdx.x + blockIdx.x * blockDim.x; // align to warp size + inc = blockDim.x * gridDim.x; + k = (bottom & (-WARPSIZE)) + threadIdx.x + blockIdx.x * blockDim.x; // align to warp size if (k < bottom) k += inc; int restart = k; @@ -366,21 +400,21 @@ void SummarizationKernel(const int nnodesd, const int nbodiesd, volatile int* co while (k <= nnodesd) { if (posMassd[k].w < 0.0f) { for (i = 0; i < 8; i++) { - ch = childd[k*8+i]; - child[i*THREADS3+threadIdx.x] = ch; // cache children - if ((ch >= nbodiesd) && ((mass[i*THREADS3+threadIdx.x] = posMassd[ch].w) < 0.0f)) { + ch = childd[k * 8 + i]; + child[i * THREADS3 + threadIdx.x] = ch; // cache children + if ((ch >= nbodiesd) && ((mass[i * THREADS3 + threadIdx.x] = posMassd[ch].w) < 0.0f)) { break; } } if (i == 8) { // all children are ready - cm = 0.0f; - px = 0.0f; - py = 0.0f; - pz = 0.0f; + cm = 0.0f; + px = 0.0f; + py = 0.0f; + pz = 0.0f; cnt = 0; for (i = 0; i < 8; i++) { - ch = child[i*THREADS3+threadIdx.x]; + ch = child[i * THREADS3 + threadIdx.x]; if (ch >= 0) { // four reads due to missing copy constructor for "volatile float4" const float chx = posMassd[ch].x; @@ -388,7 +422,7 @@ void SummarizationKernel(const int 
nnodesd, const int nbodiesd, volatile int* co const float chz = posMassd[ch].z; const float chw = posMassd[ch].w; if (ch >= nbodiesd) { // count bodies (needed later) - m = mass[i*THREADS3+threadIdx.x]; + m = mass[i * THREADS3 + threadIdx.x]; cnt += countd[ch]; } else { m = chw; @@ -402,7 +436,7 @@ void SummarizationKernel(const int nnodesd, const int nbodiesd, volatile int* co } } countd[k] = cnt; - m = 1.0f / cm; + m = 1.0f / cm; // four writes due to missing copy constructor for "volatile float4" posMassd[k].x = px * m; posMassd[k].y = py * m; @@ -425,17 +459,18 @@ void SummarizationKernel(const int nnodesd, const int nbodiesd, volatile int* co if (j == 0) { j = 8; for (i = 0; i < 8; i++) { - ch = childd[k*8+i]; - child[i*THREADS3+threadIdx.x] = ch; // cache children - if ((ch < nbodiesd) || ((mass[i*THREADS3+threadIdx.x] = posMassd[ch].w) >= 0.0f)) { + ch = childd[k * 8 + i]; + child[i * THREADS3 + threadIdx.x] = ch; // cache children + if ((ch < nbodiesd) || ((mass[i * THREADS3 + threadIdx.x] = posMassd[ch].w) >= 0.0f)) { j--; } } } else { j = 8; for (i = 0; i < 8; i++) { - ch = child[i*THREADS3+threadIdx.x]; - if ((ch < nbodiesd) || (mass[i*THREADS3+threadIdx.x] >= 0.0f) || ((mass[i*THREADS3+threadIdx.x] = posMassd[ch].w) >= 0.0f)) { + ch = child[i * THREADS3 + threadIdx.x]; + if ((ch < nbodiesd) || (mass[i * THREADS3 + threadIdx.x] >= 0.0f) || + ((mass[i * THREADS3 + threadIdx.x] = posMassd[ch].w) >= 0.0f)) { j--; } } @@ -443,13 +478,13 @@ void SummarizationKernel(const int nnodesd, const int nbodiesd, volatile int* co if (j == 0) { // all children are ready - cm = 0.0f; - px = 0.0f; - py = 0.0f; - pz = 0.0f; + cm = 0.0f; + px = 0.0f; + py = 0.0f; + pz = 0.0f; cnt = 0; for (i = 0; i < 8; i++) { - ch = child[i*THREADS3+threadIdx.x]; + ch = child[i * THREADS3 + threadIdx.x]; if (ch >= 0) { // four reads due to missing copy constructor for "volatile float4" const float chx = posMassd[ch].x; @@ -457,7 +492,7 @@ void SummarizationKernel(const int nnodesd, const int nbodiesd, volatile int* co const float chz = posMassd[ch].z; const float chw = posMassd[ch].w; if (ch >= nbodiesd) { // count bodies (needed later) - m = mass[i*THREADS3+threadIdx.x]; + m = mass[i * THREADS3 + threadIdx.x]; cnt += countd[ch]; } else { m = chw; @@ -471,7 +506,7 @@ void SummarizationKernel(const int nnodesd, const int nbodiesd, volatile int* co } } countd[k] = cnt; - m = 1.0f / cm; + m = 1.0f / cm; // four writes due to missing copy constructor for "volatile float4" posMassd[k].x = px * m; posMassd[k].y = py * m; @@ -484,20 +519,23 @@ void SummarizationKernel(const int nnodesd, const int nbodiesd, volatile int* co } } - /******************************************************************************/ /*** sort bodies **************************************************************/ /******************************************************************************/ -__global__ -__launch_bounds__(THREADS4, FACTOR4) -void SortKernel(const int nnodesd, const int nbodiesd, int* const __restrict__ sortd, const int* const __restrict__ countd, volatile int* const __restrict__ startd, int* const __restrict__ childd) +__global__ __launch_bounds__(THREADS4, + FACTOR4) void SortKernel(const int nnodesd, + const int nbodiesd, + int* const __restrict__ sortd, + const int* const __restrict__ countd, + volatile int* const __restrict__ startd, + int* const __restrict__ childd) { int i, j, k, ch, dec, start, bottom; bottom = bottomd; - dec = blockDim.x * gridDim.x; - k = nnodesd + 1 - dec + threadIdx.x + blockIdx.x * blockDim.x; + dec 
= blockDim.x * gridDim.x; + k = nnodesd + 1 - dec + threadIdx.x + blockIdx.x * blockDim.x; // iterate over all cells assigned to thread while (k >= bottom) { @@ -505,17 +543,17 @@ void SortKernel(const int nnodesd, const int nbodiesd, int* const __restrict__ s if (start >= 0) { j = 0; for (i = 0; i < 8; i++) { - ch = childd[k*8+i]; + ch = childd[k * 8 + i]; if (ch >= 0) { if (i != j) { // move children to front (needed later for speed) - childd[k*8+i] = -1; - childd[k*8+j] = ch; + childd[k * 8 + i] = -1; + childd[k * 8 + j] = ch; } j++; if (ch >= nbodiesd) { // child is a cell - startd[ch] = start; // set start ID of child + startd[ch] = start; // set start ID of child start += countd[ch]; // add #bodies in subtree } else { // child is a body @@ -524,25 +562,32 @@ void SortKernel(const int nnodesd, const int nbodiesd, int* const __restrict__ s } } } - k -= dec; // move on to next cell + k -= dec; // move on to next cell } __syncthreads(); // optional barrier for performance } } - /******************************************************************************/ /*** compute force ************************************************************/ /******************************************************************************/ -__global__ -__launch_bounds__(THREADS5, FACTOR5) -void ForceCalculationKernel(const int nnodesd, const int nbodiesd, const float dthfd, const float itolsqd, const float epssqd, const int* const __restrict__ sortd, const int* const __restrict__ childd, const float4* const __restrict__ posMassd, float2* const __restrict__ veld, float4* const __restrict__ accVeld) +__global__ __launch_bounds__(THREADS5, FACTOR5) void ForceCalculationKernel( + const int nnodesd, + const int nbodiesd, + const float dthfd, + const float itolsqd, + const float epssqd, + const int* const __restrict__ sortd, + const int* const __restrict__ childd, + const float4* const __restrict__ posMassd, + float2* const __restrict__ veld, + float4* const __restrict__ accVeld) { int i, j, k, n, depth, base, sbase, diff, pd, nd; float ax, ay, az, dx, dy, dz, tmp; - __shared__ volatile int pos[MAXDEPTH * THREADS5/WARPSIZE], node[MAXDEPTH * THREADS5/WARPSIZE]; - __shared__ float dq[MAXDEPTH * THREADS5/WARPSIZE]; + __shared__ volatile int pos[MAXDEPTH * THREADS5 / WARPSIZE], node[MAXDEPTH * THREADS5 / WARPSIZE]; + __shared__ float dq[MAXDEPTH * THREADS5 / WARPSIZE]; if (0 == threadIdx.x) { tmp = radiusd * 2; @@ -557,15 +602,13 @@ void ForceCalculationKernel(const int nnodesd, const int nbodiesd, const float d __syncthreads(); // figure out first thread in each warp (lane 0) - base = threadIdx.x / WARPSIZE; + base = threadIdx.x / WARPSIZE; sbase = base * WARPSIZE; - j = base * MAXDEPTH; + j = base * MAXDEPTH; diff = threadIdx.x - sbase; // make multiple copies to avoid index calculations later - if (diff < MAXDEPTH) { - dq[diff+j] = dq[diff]; - } + if (diff < MAXDEPTH) { dq[diff + j] = dq[diff]; } __syncthreads(); // iterate over all bodies assigned to thread @@ -581,7 +624,7 @@ void ForceCalculationKernel(const int nnodesd, const int nbodiesd, const float d // initialize iteration stack, i.e., push root node onto stack depth = j; if (sbase == threadIdx.x) { - pos[j] = 0; + pos[j] = 0; node[j] = nnodesd * 8; } @@ -596,12 +639,15 @@ void ForceCalculationKernel(const int nnodesd, const int nbodiesd, const float d if (n >= 0) { const float4 pn = posMassd[n]; - dx = pn.x - pi.x; - dy = pn.y - pi.y; - dz = pn.z - pi.z; - tmp = dx*dx + (dy*dy + (dz*dz + epssqd)); // compute distance squared (plus softening) - if ((n < 
nbodiesd) || __all_sync(0xffffffff, tmp >= dq[depth])) { // check if all threads agree that cell is far enough away (or is a body) - tmp = rsqrtf(tmp); // compute distance + dx = pn.x - pi.x; + dy = pn.y - pi.y; + dz = pn.z - pi.z; + tmp = + dx * dx + (dy * dy + (dz * dz + epssqd)); // compute distance squared (plus softening) + if ((n < nbodiesd) || + __all_sync(0xffffffff, tmp >= dq[depth])) { // check if all threads agree that cell + // is far enough away (or is a body) + tmp = rsqrtf(tmp); // compute distance tmp = pn.w * tmp * tmp * tmp; ax += dx * tmp; ay += dy * tmp; @@ -609,7 +655,7 @@ void ForceCalculationKernel(const int nnodesd, const int nbodiesd, const float d } else { // push cell onto stack if (sbase == threadIdx.x) { - pos[depth] = pd; + pos[depth] = pd; node[depth] = nd; } depth++; @@ -634,21 +680,24 @@ void ForceCalculationKernel(const int nnodesd, const int nbodiesd, const float d } // save computed acceleration - acc.x = ax; - acc.y = ay; - acc.z = az; + acc.x = ax; + acc.y = ay; + acc.z = az; accVeld[i] = acc; } } - /******************************************************************************/ /*** advance bodies ***********************************************************/ /******************************************************************************/ -__global__ -__launch_bounds__(THREADS6, FACTOR6) -void IntegrationKernel(const int nbodiesd, const float dtimed, const float dthfd, float4* const __restrict__ posMass, float2* const __restrict__ veld, float4* const __restrict__ accVeld) +__global__ __launch_bounds__(THREADS6, + FACTOR6) void IntegrationKernel(const int nbodiesd, + const float dtimed, + const float dthfd, + float4* const __restrict__ posMass, + float2* const __restrict__ veld, + float4* const __restrict__ accVeld) { int i, inc; float dvelx, dvely, dvelz; @@ -659,14 +708,14 @@ void IntegrationKernel(const int nbodiesd, const float dtimed, const float dthfd for (i = threadIdx.x + blockIdx.x * blockDim.x; i < nbodiesd; i += inc) { // integrate float4 acc = accVeld[i]; - dvelx = acc.x * dthfd; - dvely = acc.y * dthfd; - dvelz = acc.z * dthfd; + dvelx = acc.x * dthfd; + dvely = acc.y * dthfd; + dvelz = acc.z * dthfd; float2 v = veld[i]; - velhx = v.x + dvelx; - velhy = v.y + dvely; - velhz = acc.w + dvelz; + velhx = v.x + dvelx; + velhy = v.y + dvely; + velhz = acc.w + dvelz; float4 p = posMass[i]; p.x += velhx * dtimed; @@ -674,10 +723,10 @@ void IntegrationKernel(const int nbodiesd, const float dtimed, const float dthfd p.z += velhz * dtimed; posMass[i] = p; - v.x = velhx + dvelx; - v.y = velhy + dvely; - acc.w = velhz + dvelz; - veld[i] = v; + v.x = velhx + dvelx; + v.y = velhy + dvely; + acc.w = velhz + dvelz; + veld[i] = v; accVeld[i] = acc; } } @@ -696,22 +745,20 @@ static void CudaTest(const char* const msg) } } - /******************************************************************************/ -// random number generator (based on SPLASH-2 code at https://github.com/staceyson/splash2/blob/master/codes/apps/barnes/util.C) +// random number generator (based on SPLASH-2 code at +// https://github.com/staceyson/splash2/blob/master/codes/apps/barnes/util.C) static int randx = 7; - static double drnd() { - const int lastrand = randx; - randx = (1103515245 * randx + 12345) & 0x7FFFFFFF; - return (double)lastrand / 2147483648.0; + const int lastrand = randx; + randx = (1103515245 * randx + 12345) & 0x7FFFFFFF; + return (double)lastrand / 2147483648.0; } - /******************************************************************************/ int main(int argc, 
char* argv[]) @@ -723,14 +770,14 @@ int main(int argc, char* argv[]) float time, timing[7]; cudaEvent_t start, stop; - float4 *accVel; - float2 *vel; + float4* accVel; + float2* vel; int *sortl, *childl, *countl, *startl; - float4 *accVell; - float2 *vell; + float4* accVell; + float2* vell; float3 *maxl, *minl; - float4 *posMassl; - float4 *posMass; + float4* posMassl; + float4* posMass; double rsc, vsc, r, v, x, y, z, sq, scale; // perform some checks @@ -774,11 +821,16 @@ int main(int argc, char* argv[]) exit(-1); } - blocks = deviceProp.multiProcessorCount; + blocks = deviceProp.multiProcessorCount; const int mTSM = deviceProp.maxThreadsPerMultiProcessor; - printf("gpu: %s with %d SMs and %d mTpSM (%.1f MHz and %.1f MHz)\n", deviceProp.name, blocks, mTSM, deviceProp.clockRate * 0.001, deviceProp.memoryClockRate * 0.001); - - if ((WARPSIZE <= 0) || (WARPSIZE & (WARPSIZE-1) != 0)) { + printf("gpu: %s with %d SMs and %d mTpSM (%.1f MHz and %.1f MHz)\n", + deviceProp.name, + blocks, + mTSM, + deviceProp.clockRate * 0.001, + deviceProp.memoryClockRate * 0.001); + + if ((WARPSIZE <= 0) || (WARPSIZE & (WARPSIZE - 1) != 0)) { fprintf(stderr, "Warp size must be greater than zero and a power of two\n"); exit(-1); } @@ -786,7 +838,7 @@ int main(int argc, char* argv[]) fprintf(stderr, "MAXDEPTH must be less than or equal to WARPSIZE\n"); exit(-1); } - if ((THREADS1 <= 0) || (THREADS1 & (THREADS1-1) != 0)) { + if ((THREADS1 <= 0) || (THREADS1 & (THREADS1 - 1) != 0)) { fprintf(stderr, "THREADS1 must be greater than zero and a power of two\n"); exit(-1); } @@ -801,9 +853,10 @@ int main(int argc, char* argv[]) cudaFuncSetCacheConfig(ForceCalculationKernel, cudaFuncCachePreferEqual); cudaFuncSetCacheConfig(IntegrationKernel, cudaFuncCachePreferL1); - cudaGetLastError(); // reset error value + cudaGetLastError(); // reset error value for (run = 0; run < 1; run++) { // in case multiple runs are desired for timing - for (i = 0; i < 7; i++) timing[i] = 0.0f; + for (i = 0; i < 7; i++) + timing[i] = 0.0f; nbodies = atoi(argv[1]); if (nbodies < 1) { @@ -815,14 +868,16 @@ int main(int argc, char* argv[]) exit(-1); } nnodes = nbodies * 2; - if (nnodes < 1024*blocks) nnodes = 1024*blocks; - while ((nnodes & (WARPSIZE-1)) != 0) nnodes++; + if (nnodes < 1024 * blocks) nnodes = 1024 * blocks; + while ((nnodes & (WARPSIZE - 1)) != 0) + nnodes++; nnodes--; timesteps = atoi(argv[2]); - dtime = 0.025; dthf = dtime * 0.5f; - epssq = 0.05 * 0.05; - itolsq = 1.0f / (0.5 * 0.5); + dtime = 0.025; + dthf = dtime * 0.5f; + epssq = 0.05 * 0.05; + itolsq = 1.0f / (0.5 * 0.5); // allocate memory @@ -830,124 +885,181 @@ int main(int argc, char* argv[]) printf("configuration: %d bodies, %d time steps\n", nbodies, timesteps); accVel = (float4*)malloc(sizeof(float4) * nbodies); - if (accVel == NULL) {fprintf(stderr, "cannot allocate accVel\n"); exit(-1);} + if (accVel == NULL) { + fprintf(stderr, "cannot allocate accVel\n"); + exit(-1); + } vel = (float2*)malloc(sizeof(float2) * nbodies); - if (vel == NULL) {fprintf(stderr, "cannot allocate vel\n"); exit(-1);} + if (vel == NULL) { + fprintf(stderr, "cannot allocate vel\n"); + exit(-1); + } posMass = (float4*)malloc(sizeof(float4) * nbodies); - if (posMass == NULL) {fprintf(stderr, "cannot allocate posMass\n"); exit(-1);} - - if (cudaSuccess != cudaMalloc((void **)&childl, sizeof(int) * (nnodes+1) * 8)) fprintf(stderr, "could not allocate childd\n"); CudaTest("couldn't allocate childd"); - if (cudaSuccess != cudaMalloc((void **)&vell, sizeof(float2) * (nnodes+1))) fprintf(stderr, 
"could not allocate veld\n"); CudaTest("couldn't allocate veld"); - if (cudaSuccess != cudaMalloc((void **)&accVell, sizeof(float4) * (nnodes+1))) fprintf(stderr, "could not allocate accVeld\n"); CudaTest("couldn't allocate accVeld"); - if (cudaSuccess != cudaMalloc((void **)&countl, sizeof(int) * (nnodes+1))) fprintf(stderr, "could not allocate countd\n"); CudaTest("couldn't allocate countd"); - if (cudaSuccess != cudaMalloc((void **)&startl, sizeof(int) * (nnodes+1))) fprintf(stderr, "could not allocate startd\n"); CudaTest("couldn't allocate startd"); - if (cudaSuccess != cudaMalloc((void **)&sortl, sizeof(int) * (nnodes+1))) fprintf(stderr, "could not allocate sortd\n"); CudaTest("couldn't allocate sortd"); - - if (cudaSuccess != cudaMalloc((void **)&posMassl, sizeof(float4) * (nnodes+1))) fprintf(stderr, "could not allocate posMassd\n"); CudaTest("couldn't allocate posMassd"); + if (posMass == NULL) { + fprintf(stderr, "cannot allocate posMass\n"); + exit(-1); + } - if (cudaSuccess != cudaMalloc((void **)&maxl, sizeof(float3) * blocks * FACTOR1)) fprintf(stderr, "could not allocate maxd\n"); CudaTest("couldn't allocate maxd"); - if (cudaSuccess != cudaMalloc((void **)&minl, sizeof(float3) * blocks * FACTOR1)) fprintf(stderr, "could not allocate mind\n"); CudaTest("couldn't allocate mind"); + if (cudaSuccess != cudaMalloc((void**)&childl, sizeof(int) * (nnodes + 1) * 8)) + fprintf(stderr, "could not allocate childd\n"); + CudaTest("couldn't allocate childd"); + if (cudaSuccess != cudaMalloc((void**)&vell, sizeof(float2) * (nnodes + 1))) + fprintf(stderr, "could not allocate veld\n"); + CudaTest("couldn't allocate veld"); + if (cudaSuccess != cudaMalloc((void**)&accVell, sizeof(float4) * (nnodes + 1))) + fprintf(stderr, "could not allocate accVeld\n"); + CudaTest("couldn't allocate accVeld"); + if (cudaSuccess != cudaMalloc((void**)&countl, sizeof(int) * (nnodes + 1))) + fprintf(stderr, "could not allocate countd\n"); + CudaTest("couldn't allocate countd"); + if (cudaSuccess != cudaMalloc((void**)&startl, sizeof(int) * (nnodes + 1))) + fprintf(stderr, "could not allocate startd\n"); + CudaTest("couldn't allocate startd"); + if (cudaSuccess != cudaMalloc((void**)&sortl, sizeof(int) * (nnodes + 1))) + fprintf(stderr, "could not allocate sortd\n"); + CudaTest("couldn't allocate sortd"); + + if (cudaSuccess != cudaMalloc((void**)&posMassl, sizeof(float4) * (nnodes + 1))) + fprintf(stderr, "could not allocate posMassd\n"); + CudaTest("couldn't allocate posMassd"); + + if (cudaSuccess != cudaMalloc((void**)&maxl, sizeof(float3) * blocks * FACTOR1)) + fprintf(stderr, "could not allocate maxd\n"); + CudaTest("couldn't allocate maxd"); + if (cudaSuccess != cudaMalloc((void**)&minl, sizeof(float3) * blocks * FACTOR1)) + fprintf(stderr, "could not allocate mind\n"); + CudaTest("couldn't allocate mind"); } - // generate input (based on SPLASH-2 code at https://github.com/staceyson/splash2/blob/master/codes/apps/barnes/code.C) + // generate input (based on SPLASH-2 code at + // https://github.com/staceyson/splash2/blob/master/codes/apps/barnes/code.C) rsc = (3 * 3.1415926535897932384626433832795) / 16; vsc = sqrt(1.0 / rsc); for (i = 0; i < nbodies; i++) { float4 p; p.w = 1.0 / nbodies; - r = 1.0 / sqrt(pow(drnd()*0.999, -2.0/3.0) - 1); + r = 1.0 / sqrt(pow(drnd() * 0.999, -2.0 / 3.0) - 1); do { - x = drnd()*2.0 - 1.0; - y = drnd()*2.0 - 1.0; - z = drnd()*2.0 - 1.0; - sq = x*x + y*y + z*z; + x = drnd() * 2.0 - 1.0; + y = drnd() * 2.0 - 1.0; + z = drnd() * 2.0 - 1.0; + sq = x * x + y * y + z * z; } 
while (sq > 1.0); - scale = rsc * r / sqrt(sq); - p.x = x * scale; - p.y = y * scale; - p.z = z * scale; + scale = rsc * r / sqrt(sq); + p.x = x * scale; + p.y = y * scale; + p.z = z * scale; posMass[i] = p; do { x = drnd(); y = drnd() * 0.1; - } while (y > x*x * pow(1 - x*x, 3.5)); - v = x * sqrt(2.0 / sqrt(1 + r*r)); + } while (y > x * x * pow(1 - x * x, 3.5)); + v = x * sqrt(2.0 / sqrt(1 + r * r)); do { - x = drnd()*2.0 - 1.0; - y = drnd()*2.0 - 1.0; - z = drnd()*2.0 - 1.0; - sq = x*x + y*y + z*z; + x = drnd() * 2.0 - 1.0; + y = drnd() * 2.0 - 1.0; + z = drnd() * 2.0 - 1.0; + sq = x * x + y * y + z * z; } while (sq > 1.0); scale = vsc * v / sqrt(sq); float2 v; - v.x = x * scale; - v.y = y * scale; + v.x = x * scale; + v.y = y * scale; accVel[i].w = z * scale; - vel[i] = v; + vel[i] = v; } - if (cudaSuccess != cudaMemcpy(accVell, accVel, sizeof(float4) * nbodies, cudaMemcpyHostToDevice)) fprintf(stderr, "copying of vel to device failed\n"); CudaTest("vel copy to device failed"); - if (cudaSuccess != cudaMemcpy(vell, vel, sizeof(float2) * nbodies, cudaMemcpyHostToDevice)) fprintf(stderr, "copying of vel to device failed\n"); CudaTest("vel copy to device failed"); - if (cudaSuccess != cudaMemcpy(posMassl, posMass, sizeof(float4) * nbodies, cudaMemcpyHostToDevice)) fprintf(stderr, "copying of posMass to device failed\n"); CudaTest("posMass copy to device failed"); + if (cudaSuccess != + cudaMemcpy(accVell, accVel, sizeof(float4) * nbodies, cudaMemcpyHostToDevice)) + fprintf(stderr, "copying of vel to device failed\n"); + CudaTest("vel copy to device failed"); + if (cudaSuccess != cudaMemcpy(vell, vel, sizeof(float2) * nbodies, cudaMemcpyHostToDevice)) + fprintf(stderr, "copying of vel to device failed\n"); + CudaTest("vel copy to device failed"); + if (cudaSuccess != + cudaMemcpy(posMassl, posMass, sizeof(float4) * nbodies, cudaMemcpyHostToDevice)) + fprintf(stderr, "copying of posMass to device failed\n"); + CudaTest("posMass copy to device failed"); // run timesteps (launch GPU kernels) - cudaEventCreate(&start); cudaEventCreate(&stop); + cudaEventCreate(&start); + cudaEventCreate(&stop); struct timeval starttime, endtime; gettimeofday(&starttime, NULL); cudaEventRecord(start, 0); InitializationKernel<<<1, 1>>>(); - cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&time, start, stop); timing[0] += time; - //CudaTest("kernel 0 launch failed"); + // CudaTest("kernel 0 launch failed"); for (step = 0; step < timesteps; step++) { cudaEventRecord(start, 0); - BoundingBoxKernel<<>>(nnodes, nbodies, startl, childl, posMassl, maxl, minl); - cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); + BoundingBoxKernel<<>>( + nnodes, nbodies, startl, childl, posMassl, maxl, minl); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&time, start, stop); timing[1] += time; - //CudaTest("kernel 1 launch failed"); + // CudaTest("kernel 1 launch failed"); cudaEventRecord(start, 0); ClearKernel1<<>>(nnodes, nbodies, childl); TreeBuildingKernel<<>>(nnodes, nbodies, childl, posMassl); ClearKernel2<<>>(nnodes, startl, posMassl); - cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&time, start, stop); timing[2] += time; - //CudaTest("kernel 2 launch failed"); + // CudaTest("kernel 2 launch 
failed"); cudaEventRecord(start, 0); - SummarizationKernel<<>>(nnodes, nbodies, countl, childl, posMassl); - cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); + SummarizationKernel<<>>( + nnodes, nbodies, countl, childl, posMassl); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&time, start, stop); timing[3] += time; - //CudaTest("kernel 3 launch failed"); + // CudaTest("kernel 3 launch failed"); cudaEventRecord(start, 0); SortKernel<<>>(nnodes, nbodies, sortl, countl, startl, childl); - cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&time, start, stop); timing[4] += time; - //CudaTest("kernel 4 launch failed"); + // CudaTest("kernel 4 launch failed"); cudaEventRecord(start, 0); - ForceCalculationKernel<<>>(nnodes, nbodies, dthf, itolsq, epssq, sortl, childl, posMassl, vell, accVell); - cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); + ForceCalculationKernel<<>>( + nnodes, nbodies, dthf, itolsq, epssq, sortl, childl, posMassl, vell, accVell); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&time, start, stop); timing[5] += time; - //CudaTest("kernel 5 launch failed"); + // CudaTest("kernel 5 launch failed"); cudaEventRecord(start, 0); - IntegrationKernel<<>>(nbodies, dtime, dthf, posMassl, vell, accVell); - cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); + IntegrationKernel<<>>( + nbodies, dtime, dthf, posMassl, vell, accVell); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&time, start, stop); timing[6] += time; - //CudaTest("kernel 6 launch failed"); + // CudaTest("kernel 6 launch failed"); } CudaTest("kernel launch failed"); - cudaEventDestroy(start); cudaEventDestroy(stop); + cudaEventDestroy(start); + cudaEventDestroy(stop); gettimeofday(&endtime, NULL); - runtime = (endtime.tv_sec + endtime.tv_usec/1000000.0 - starttime.tv_sec - starttime.tv_usec/1000000.0); + runtime = (endtime.tv_sec + endtime.tv_usec / 1000000.0 - starttime.tv_sec - + starttime.tv_usec / 1000000.0); printf("runtime: %.4lf s (", runtime); time = 0; @@ -959,15 +1071,22 @@ int main(int argc, char* argv[]) } // transfer final result back to CPU - if (cudaSuccess != cudaMemcpy(accVel, accVell, sizeof(float4) * nbodies, cudaMemcpyDeviceToHost)) fprintf(stderr, "copying of accVel from device failed\n"); CudaTest("vel copy from device failed"); - if (cudaSuccess != cudaMemcpy(vel, vell, sizeof(float2) * nbodies, cudaMemcpyDeviceToHost)) fprintf(stderr, "copying of vel from device failed\n"); CudaTest("vel copy from device failed"); - if (cudaSuccess != cudaMemcpy(posMass, posMassl, sizeof(float4) * nbodies, cudaMemcpyDeviceToHost)) fprintf(stderr, "copying of posMass from device failed\n"); CudaTest("posMass copy from device failed"); + if (cudaSuccess != cudaMemcpy(accVel, accVell, sizeof(float4) * nbodies, cudaMemcpyDeviceToHost)) + fprintf(stderr, "copying of accVel from device failed\n"); + CudaTest("vel copy from device failed"); + if (cudaSuccess != cudaMemcpy(vel, vell, sizeof(float2) * nbodies, cudaMemcpyDeviceToHost)) + fprintf(stderr, "copying of vel from device failed\n"); + CudaTest("vel copy from device failed"); + if (cudaSuccess != + cudaMemcpy(posMass, posMassl, sizeof(float4) * nbodies, cudaMemcpyDeviceToHost)) + fprintf(stderr, "copying 
of posMass from device failed\n"); + CudaTest("posMass copy from device failed"); // print output i = 0; -// for (i = 0; i < nbodies; i++) { - printf("%.2e %.2e %.2e\n", posMass[i].x, posMass[i].y, posMass[i].z); -// } + // for (i = 0; i < nbodies; i++) { + printf("%.2e %.2e %.2e\n", posMass[i].x, posMass[i].y, posMass[i].z); + // } free(vel); free(accVel); diff --git a/cpp/src/tsne/exact_kernels.cuh b/cpp/src/tsne/exact_kernels.cuh index e7a92ac2ec..7ccb6e279d 100644 --- a/cpp/src/tsne/exact_kernels.cuh +++ b/cpp/src/tsne/exact_kernels.cuh @@ -290,8 +290,8 @@ __global__ void repulsive_kernel(const value_t* restrict Y, value_t* restrict Z_sum2, const value_idx n, const value_idx dim, - const value_t df_power, // -(df + 1)/2) - const value_t recp_df) // 1 / df + const value_t df_power, // -(df + 1)/2) + const value_t recp_df) // 1 / df { const auto j = (blockIdx.x * blockDim.x) + threadIdx.x; // for every item in row const auto i = (blockIdx.y * blockDim.y) + threadIdx.y; // for every row diff --git a/cpp/src/umap/fuzzy_simpl_set/naive.cuh b/cpp/src/umap/fuzzy_simpl_set/naive.cuh index f674b0ba0f..0ac61f05b2 100644 --- a/cpp/src/umap/fuzzy_simpl_set/naive.cuh +++ b/cpp/src/umap/fuzzy_simpl_set/naive.cuh @@ -194,7 +194,7 @@ __global__ void compute_membership_strength_kernel( const value_idx* knn_indices, const float* knn_dists, // nn outputs const value_t* sigmas, - const value_t* rhos, // continuous dists to nearest neighbors + const value_t* rhos, // continuous dists to nearest neighbors value_t* vals, int* rows, int* cols, // result coo diff --git a/cpp/src_prims/timeSeries/arima_helpers.cuh b/cpp/src_prims/timeSeries/arima_helpers.cuh index e34c910e45..f11909f1ff 100644 --- a/cpp/src_prims/timeSeries/arima_helpers.cuh +++ b/cpp/src_prims/timeSeries/arima_helpers.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -158,7 +158,7 @@ __global__ void _undiff_kernel(DataT* d_fc, for (int i = 0; i < num_steps; i++) { if (!double_diff) { // One simple or seasonal difference b_fc[i] += _select_read(b_in, n_in, b_fc, i - s0); - } else { // Two differences (simple, seasonal or both) + } else { // Two differences (simple, seasonal or both) DataT fc_acc = -_select_read(b_in, n_in, b_fc, i - s0 - s1); fc_acc += _select_read(b_in, n_in, b_fc, i - s0); fc_acc += _select_read(b_in, n_in, b_fc, i - s1); diff --git a/cpp/test/c_api/dbscan_api_test.c b/cpp/test/c_api/dbscan_api_test.c index e44f59a1e7..6b565e0624 100644 --- a/cpp/test/c_api/dbscan_api_test.c +++ b/cpp/test/c_api/dbscan_api_test.c @@ -16,12 +16,12 @@ #include -void test_dbscan() { +void test_dbscan() +{ + cumlHandle_t handle = 0; + cumlError_t response = CUML_SUCCESS; - cumlHandle_t handle = 0; - cumlError_t response = CUML_SUCCESS; + response = cumlSpDbscanFit(handle, NULL, 0, 1, 1.0f, 2, NULL, NULL, 10, 1); - response = cumlSpDbscanFit(handle, NULL, 0, 1, 1.0f, 2, NULL, NULL, 10, 1); - - response = cumlDpDbscanFit(handle, NULL, 0, 1, 1.0, 2, NULL, NULL, 10, 1); + response = cumlDpDbscanFit(handle, NULL, 0, 1, 1.0, 2, NULL, NULL, 10, 1); } diff --git a/cpp/test/c_api/glm_api_test.c b/cpp/test/c_api/glm_api_test.c index d7efce77cb..47c9e40099 100644 --- a/cpp/test/c_api/glm_api_test.c +++ b/cpp/test/c_api/glm_api_test.c @@ -14,29 +14,26 @@ * limitations under the License. 
*/ -#include #include +#include -void test_glm() { - - cumlHandle_t handle = 0; - cumlError_t response = CUML_SUCCESS; - qn_params pams = { - .loss = QN_LOSS_UNKNOWN, - .penalty_l1 = 0, - .penalty_l2 = 1.0, - .grad_tol = 1e-4, - .change_tol = 1e-5, - .max_iter = 1000, - .linesearch_max_iter = 50, - .lbfgs_memory = 5, - .verbose = 0, - .fit_intercept = true, - .penalty_normalized = true - }; - - response = cumlSpQnFit(handle, &pams, NULL, NULL, 0, 1, 2, NULL, NULL, NULL, true); +void test_glm() +{ + cumlHandle_t handle = 0; + cumlError_t response = CUML_SUCCESS; + qn_params pams = {.loss = QN_LOSS_UNKNOWN, + .penalty_l1 = 0, + .penalty_l2 = 1.0, + .grad_tol = 1e-4, + .change_tol = 1e-5, + .max_iter = 1000, + .linesearch_max_iter = 50, + .lbfgs_memory = 5, + .verbose = 0, + .fit_intercept = true, + .penalty_normalized = true}; - response = cumlDpQnFit(handle, &pams, NULL, NULL, 0, 1, 2, NULL, NULL, NULL, true); + response = cumlSpQnFit(handle, &pams, NULL, NULL, 0, 1, 2, NULL, NULL, NULL, true); + response = cumlDpQnFit(handle, &pams, NULL, NULL, 0, 1, 2, NULL, NULL, NULL, true); } diff --git a/cpp/test/c_api/holtwinters_api_test.c b/cpp/test/c_api/holtwinters_api_test.c index 6a2e60935b..a8a01a0e46 100644 --- a/cpp/test/c_api/holtwinters_api_test.c +++ b/cpp/test/c_api/holtwinters_api_test.c @@ -16,18 +16,20 @@ #include -void test_holtwinters() { +void test_holtwinters() +{ + cumlHandle_t handle = 0; + cumlError_t response = CUML_SUCCESS; - cumlHandle_t handle = 0; - cumlError_t response = CUML_SUCCESS; + response = cumlHoltWinters_buffer_size(0, 1, 2, NULL, NULL, NULL, NULL, NULL, NULL); - response = cumlHoltWinters_buffer_size(0, 1, 2, NULL, NULL, NULL, NULL, NULL, NULL); + response = + cumlHoltWintersSp_fit(handle, 0, 1, 2, 3, ADDITIVE, 1.0f, NULL, NULL, NULL, NULL, NULL); - response = cumlHoltWintersSp_fit(handle, 0, 1, 2, 3, ADDITIVE, 1.0f, NULL, NULL, NULL, NULL, NULL); + response = + cumlHoltWintersDp_fit(handle, 0, 1, 2, 3, ADDITIVE, 1.0f, NULL, NULL, NULL, NULL, NULL); - response = cumlHoltWintersDp_fit(handle, 0, 1, 2, 3, ADDITIVE, 1.0f, NULL, NULL, NULL, NULL, NULL); + response = cumlHoltWintersSp_forecast(handle, 0, 1, 2, 3, ADDITIVE, NULL, NULL, NULL, NULL); - response = cumlHoltWintersSp_forecast(handle, 0, 1, 2, 3, ADDITIVE, NULL, NULL, NULL, NULL); - - response = cumlHoltWintersDp_forecast(handle, 0, 1, 2, 3, ADDITIVE, NULL, NULL, NULL, NULL); + response = cumlHoltWintersDp_forecast(handle, 0, 1, 2, 3, ADDITIVE, NULL, NULL, NULL, NULL); } \ No newline at end of file diff --git a/cpp/test/c_api/knn_api_test.c b/cpp/test/c_api/knn_api_test.c index e8b8bdf07b..0740ead431 100644 --- a/cpp/test/c_api/knn_api_test.c +++ b/cpp/test/c_api/knn_api_test.c @@ -16,10 +16,11 @@ #include -void test_knn() { +void test_knn() +{ + cumlHandle_t handle = 0; + cumlError_t response = CUML_SUCCESS; - cumlHandle_t handle = 0; - cumlError_t response = CUML_SUCCESS; - - response = knn_search(handle, NULL, NULL, 1, 2, NULL, 3, NULL, NULL, 4, true, false, 0, 2.0f, false); + response = + knn_search(handle, NULL, NULL, 1, 2, NULL, 3, NULL, NULL, 4, true, false, 0, 2.0f, false); } \ No newline at end of file diff --git a/cpp/test/c_api/svm_api_test.c b/cpp/test/c_api/svm_api_test.c index f7792839f7..5b95bf9148 100644 --- a/cpp/test/c_api/svm_api_test.c +++ b/cpp/test/c_api/svm_api_test.c @@ -16,16 +16,60 @@ #include -void test_svm() { +void test_svm() +{ + cumlHandle_t handle = 0; + cumlError_t response = CUML_SUCCESS; - cumlHandle_t handle = 0; - cumlError_t response = CUML_SUCCESS; + response = 
cumlSpSvcFit(handle, + NULL, + 0, + 1, + NULL, + 1.0f, + 2.0f, + 2, + 3, + 3.0f, + 4, + LINEAR, + 5, + 6.0f, + 7.0f, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL); - response = cumlSpSvcFit(handle, NULL, 0, 1, NULL, 1.0f, 2.0f, 2, 3, 3.0f, 4, LINEAR, 5, 6.0f, 7.0f, NULL, NULL, NULL, NULL, NULL, NULL, NULL); + response = cumlDpSvcFit(handle, + NULL, + 0, + 1, + NULL, + 1.0, + 2.0, + 2, + 3, + 3.0, + 4, + LINEAR, + 5, + 6.0, + 7.0, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL); - response = cumlDpSvcFit(handle, NULL, 0, 1, NULL, 1.0, 2.0, 2, 3, 3.0, 4, LINEAR, 5, 6.0, 7.0, NULL, NULL, NULL, NULL, NULL, NULL, NULL); + response = cumlSpSvcPredict( + handle, NULL, 0, 1, LINEAR, 2, 3.0f, 4.0f, 5, 6.0f, NULL, NULL, 7, NULL, NULL, 8.0f, 9); - response = cumlSpSvcPredict(handle, NULL, 0, 1, LINEAR, 2, 3.0f, 4.0f, 5, 6.0f, NULL, NULL, 7, NULL, NULL, 8.0f, 9); - - response = cumlDpSvcPredict(handle, NULL, 0, 1, LINEAR, 2, 3.0, 4.0, 5, 6.0, NULL, NULL, 7, NULL, NULL, 8.0, 9); + response = cumlDpSvcPredict( + handle, NULL, 0, 1, LINEAR, 2, 3.0, 4.0, 5, 6.0, NULL, NULL, 7, NULL, NULL, 8.0, 9); } \ No newline at end of file diff --git a/cpp/test/prims/batched/csr.cu b/cpp/test/prims/batched/csr.cu index 7fa547df49..1ca7748a72 100644 --- a/cpp/test/prims/batched/csr.cu +++ b/cpp/test/prims/batched/csr.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,7 +38,7 @@ template struct CSRInputs { CSROperation operation; int batch_size; - int m; // Dimensions of A + int m; // Dimensions of A int n; int nnz; // Number of non-zero elements in A int p; // Dimensions of B or x diff --git a/cpp/test/sg/fil_child_index_test.cu b/cpp/test/sg/fil_child_index_test.cu index 2ab3eed56e..b9b740f29f 100644 --- a/cpp/test/sg/fil_child_index_test.cu +++ b/cpp/test/sg/fil_child_index_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -206,7 +206,7 @@ std::vector params = { CHILD_INDEX_TEST_PARAMS(node = NODE(def_left = true), input = QNAN, correct = 1), // !def_left CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = QNAN), input = QNAN, correct = 2), // !def_left CHILD_INDEX_TEST_PARAMS( - node = NODE(def_left = true, thresh = QNAN), input = QNAN, correct = 1), // !def_left + node = NODE(def_left = true, thresh = QNAN), input = QNAN, correct = 1), // !def_left CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = QNAN), input = 0.0, correct = 1), // val !>= thresh CHILD_INDEX_TEST_PARAMS( node = NODE(thresh = 0.0), parent_node_idx = 1, input = -INF, correct = 3), @@ -224,7 +224,7 @@ std::vector params = { node = NODE(thresh = 0.0), parent_node_idx = 4, input = -INF, correct = 9), CHILD_INDEX_TEST_PARAMS( node = NODE(thresh = 0.0), parent_node_idx = 4, input = 0.0, correct = 10), - CHILD_INDEX_TEST_PARAMS(parent_node_idx = 4, input = QNAN, correct = 10), // !def_left + CHILD_INDEX_TEST_PARAMS(parent_node_idx = 4, input = QNAN, correct = 10), // !def_left CHILD_INDEX_TEST_PARAMS( node = NODE(def_left = true), input = QNAN, parent_node_idx = 4, correct = 9), // !def_left // cannot match ( < 0 and realistic fid_num_cats) diff --git a/cpp/test/sg/lars_test.cu b/cpp/test/sg/lars_test.cu index 898a56476c..8ebd6497f3 100644 --- a/cpp/test/sg/lars_test.cu +++ b/cpp/test/sg/lars_test.cu @@ -663,15 +663,15 @@ class LarsTestFitPredict : public ::testing::Test { 3.70155968e-02, 0.0740366429090}; math_t pred_exp[10] = {-121.32409183, - -170.25278892, - 19.26177047, - 89.73931476, - 100.07545046, - 83.71217894, - 40.59397899, - -109.19137223, - -72.89633962, - 140.28189898}; + -170.25278892, + 19.26177047, + 89.73931476, + 100.07545046, + 83.71217894, + 40.59397899, + -109.19137223, + -72.89633962, + 140.28189898}; int indices_exp[5] = {2, 1, 3, 4, 0}; rmm::device_uvector X; diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu index c264108df6..de8ef17010 100644 --- a/cpp/test/sg/rf_test.cu +++ b/cpp/test/sg/rf_test.cu @@ -499,11 +499,11 @@ std::vector min_samples_split = {2, 10}; std::vector min_impurity_decrease = {0.0f, 1.0f, 10.0f}; std::vector n_streams = {1, 2, 10}; std::vector split_criterion = {CRITERION::INVERSE_GAUSSIAN, - CRITERION::GAMMA, - CRITERION::POISSON, - CRITERION::MSE, - CRITERION::GINI, - CRITERION::ENTROPY}; + CRITERION::GAMMA, + CRITERION::POISSON, + CRITERION::MSE, + CRITERION::GINI, + CRITERION::ENTROPY}; std::vector seed = {0, 17}; std::vector n_labels = {2, 10, 20}; std::vector double_precision = {false, true}; diff --git a/cpp/test/sg/tsne_test.cu b/cpp/test/sg/tsne_test.cu index 6533a6a436..0a67385130 100644 --- a/cpp/test/sg/tsne_test.cu +++ b/cpp/test/sg/tsne_test.cu @@ -125,7 +125,7 @@ class TSNETest : public ::testing::TestWithParam { model_params.n_neighbors = 90; model_params.min_grad_norm = 1e-12; model_params.verbosity = CUML_LEVEL_DEBUG; - model_params.metric = DEFAULT_DISTANCE_METRIC; + model_params.metric = DEFAULT_DISTANCE_METRIC; // Allocate memory rmm::device_uvector X_d(n * p, stream); diff --git a/python/cuml/internals/callbacks_implems.h b/python/cuml/internals/callbacks_implems.h index 9407a72731..68e6503d5e 100644 --- a/python/cuml/internals/callbacks_implems.h +++ b/python/cuml/internals/callbacks_implems.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,57 +22,51 @@ #include namespace ML { - namespace Internals { +namespace Internals { - class DefaultGraphBasedDimRedCallback : public GraphBasedDimRedCallback { - public: +class DefaultGraphBasedDimRedCallback : public GraphBasedDimRedCallback { + public: + PyObject* get_numba_matrix(void* embeddings) + { + PyObject* pycl = (PyObject*)this->pyCallbackClass; - PyObject* get_numba_matrix(void* embeddings) - { - PyObject* pycl = (PyObject*)this->pyCallbackClass; - - if (isFloat) - { - return PyObject_CallMethod(pycl, - "get_numba_matrix", "(l(ll)s)", embeddings, - n, n_components, "float32"); - } else { - return PyObject_CallMethod(pycl, - "get_numba_matrix", "(l(ll)s)", embeddings, - n, n_components, "float64"); - } - } + if (isFloat) { + return PyObject_CallMethod( + pycl, "get_numba_matrix", "(l(ll)s)", embeddings, n, n_components, "float32"); + } else { + return PyObject_CallMethod( + pycl, "get_numba_matrix", "(l(ll)s)", embeddings, n, n_components, "float64"); + } + } - void on_preprocess_end(void* embeddings) override - { - PyObject* numba_matrix = get_numba_matrix(embeddings); - PyObject* res = PyObject_CallMethod(this->pyCallbackClass, - "on_preprocess_end", "(O)", numba_matrix); - Py_DECREF(numba_matrix); - Py_DECREF(res); - } + void on_preprocess_end(void* embeddings) override + { + PyObject* numba_matrix = get_numba_matrix(embeddings); + PyObject* res = + PyObject_CallMethod(this->pyCallbackClass, "on_preprocess_end", "(O)", numba_matrix); + Py_DECREF(numba_matrix); + Py_DECREF(res); + } - void on_epoch_end(void* embeddings) override - { - PyObject* numba_matrix = get_numba_matrix(embeddings); - PyObject* res = PyObject_CallMethod(this->pyCallbackClass, - "on_epoch_end", "(O)", numba_matrix); - Py_DECREF(numba_matrix); - Py_DECREF(res); - } + void on_epoch_end(void* embeddings) override + { + PyObject* numba_matrix = get_numba_matrix(embeddings); + PyObject* res = PyObject_CallMethod(this->pyCallbackClass, "on_epoch_end", "(O)", numba_matrix); + Py_DECREF(numba_matrix); + Py_DECREF(res); + } - void on_train_end(void* embeddings) override - { - PyObject* numba_matrix = get_numba_matrix(embeddings); - PyObject* res = PyObject_CallMethod(this->pyCallbackClass, - "on_train_end", "(O)", numba_matrix); - Py_DECREF(numba_matrix); - Py_DECREF(res); - } + void on_train_end(void* embeddings) override + { + PyObject* numba_matrix = get_numba_matrix(embeddings); + PyObject* res = PyObject_CallMethod(this->pyCallbackClass, "on_train_end", "(O)", numba_matrix); + Py_DECREF(numba_matrix); + Py_DECREF(res); + } - public: - PyObject* pyCallbackClass = nullptr; - }; + public: + PyObject* pyCallbackClass = nullptr; +}; - } -} +} // namespace Internals +} // namespace ML From 5186353eff30b7aa51e927cfc3d524fad1dc6a91 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Sun, 16 Apr 2023 12:33:21 -0500 Subject: [PATCH 5/6] Delete run-clang-format.py --- cpp/scripts/run-clang-format.py | 156 -------------------------------- 1 file changed, 156 deletions(-) delete mode 100755 cpp/scripts/run-clang-format.py diff --git a/cpp/scripts/run-clang-format.py b/cpp/scripts/run-clang-format.py deleted file mode 100755 index b319e32e26..0000000000 --- a/cpp/scripts/run-clang-format.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function -import sys -import re -import os -import subprocess -import argparse -import tempfile -import shutil - - -EXPECTED_VERSION = "16.0.1" -VERSION_REGEX = re.compile(r"clang-format version ([0-9.]+)") -# NOTE: populate this list with more top-level dirs as we add more of them to -# to the cuml repo -DEFAULT_DIRS = ["cpp/bench", - "cpp/comms/mpi/include", - "cpp/comms/mpi/src", - "cpp/comms/std/include", - "cpp/comms/std/src", - "cpp/include", - "cpp/examples", - "cpp/src", - "cpp/src_prims", - "cpp/test"] - - -def parse_args(): - argparser = argparse.ArgumentParser("Runs clang-format on a project") - argparser.add_argument("-dstdir", type=str, default=None, - help="Directory to store the temporary outputs of" - " clang-format. If nothing is passed for this, then" - " a temporary dir will be created using `mkdtemp`") - argparser.add_argument("-exe", type=str, default="clang-format", - help="Path to clang-format exe") - argparser.add_argument("-inplace", default=False, action="store_true", - help="Replace the source files itself.") - argparser.add_argument("-regex", type=str, - default=r"[.](cu|cuh|h|hpp|cpp)$", - help="Regex string to filter in sources") - argparser.add_argument("-ignore", type=str, default=r"cannylab/bh[.]cu$", - help="Regex used to ignore files from matched list") - argparser.add_argument("-v", dest="verbose", action="store_true", - help="Print verbose messages") - argparser.add_argument("dirs", type=str, nargs="*", - help="List of dirs where to find sources") - args = argparser.parse_args() - args.regex_compiled = re.compile(args.regex) - args.ignore_compiled = re.compile(args.ignore) - if args.dstdir is None: - args.dstdir = tempfile.mkdtemp() - ret = subprocess.check_output("%s --version" % args.exe, shell=True) - ret = ret.decode("utf-8") - version = VERSION_REGEX.search(ret) - if version is None: - raise Exception("Failed to figure out clang-format version!") - version = version.group(1) - if version != EXPECTED_VERSION: - raise Exception("clang-format exe must be v%s found '%s'" % \ - (EXPECTED_VERSION, version)) - if len(args.dirs) == 0: - args.dirs = DEFAULT_DIRS - return args - - -def list_all_src_files(file_regex, ignore_regex, srcdirs, dstdir, inplace): - allFiles = [] - for srcdir in srcdirs: - for root, dirs, files in os.walk(srcdir): - for f in files: - if re.search(file_regex, f): - src = os.path.join(root, f) - if re.search(ignore_regex, src): - continue - if inplace: - _dir = root - else: - _dir = os.path.join(dstdir, root) - dst = os.path.join(_dir, f) - allFiles.append((src, dst)) - return allFiles - - -def run_clang_format(src, dst, exe, verbose): - dstdir = os.path.dirname(dst) - if not os.path.exists(dstdir): - os.makedirs(dstdir) - # run the clang format command itself - if src == dst: - cmd = "%s -i %s" % (exe, src) - else: - cmd = "%s %s > %s" % (exe, src, dst) - try: - subprocess.check_call(cmd, shell=True) - except subprocess.CalledProcessError: - print("Failed to run clang-format! 
Maybe your env is not proper?") - raise - # run the diff to check if there are any formatting issues - cmd = "diff -q %s %s >/dev/null" % (src, dst) - try: - subprocess.check_call(cmd, shell=True) - if verbose: - print("%s passed" % os.path.basename(src)) - except subprocess.CalledProcessError: - print("%s failed! 'diff %s %s' will show formatting violations!" % \ - (os.path.basename(src), src, dst)) - return False - return True - - -def main(): - args = parse_args() - # Attempt to making sure that we run this script from root of repo always - if not os.path.exists(".git"): - print("Error!! This needs to always be run from the root of repo") - sys.exit(-1) - all_files = list_all_src_files(args.regex_compiled, args.ignore_compiled, - args.dirs, args.dstdir, args.inplace) - - # Check whether clang-format exists - if shutil.which("clang-format") is None: - print("clang-format not found. Exiting...") - return - - # actual format checker - status = True - for src, dst in all_files: - if not run_clang_format(src, dst, args.exe, args.verbose): - status = False - if not status: - print("clang-format failed! You have 2 options:") - print(" 1. Look at formatting differences above and fix them manually") - print(" 2. Or run the below command to bulk-fix all these at once") - print("Bulk-fix command: ") - print(" python cpp/scripts/run-clang-format.py %s -inplace" % \ - " ".join(sys.argv[1:])) - sys.exit(-1) - return - - -if __name__ == "__main__": - main() From 3b9520d60311e6f1408b5be4d41262191f7ff227 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 24 Apr 2023 15:37:06 -0500 Subject: [PATCH 6/6] Run clang-format. --- cpp/src/hdbscan/detail/soft_clustering.cuh | 55 +++++++++++++++------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/cpp/src/hdbscan/detail/soft_clustering.cuh b/cpp/src/hdbscan/detail/soft_clustering.cuh index f3037fb0f9..b895462fef 100644 --- a/cpp/src/hdbscan/detail/soft_clustering.cuh +++ b/cpp/src/hdbscan/detail/soft_clustering.cuh @@ -95,19 +95,40 @@ void dist_membership_vector(const raft::handle_t& handle, case raft::distance::DistanceType::L2SqrtExpanded: raft::distance:: distance( - handle, query + batch_offset * n, exemplars_dense.data(), dist.data(), samples_per_batch, n_exemplars, n, true); - break; - case raft::distance::DistanceType::L1: - raft::distance::distance( - handle, query + batch_offset * n, exemplars_dense.data(), dist.data(), samples_per_batch, n_exemplars, n, true); - break; - case raft::distance::DistanceType::CosineExpanded: - raft::distance:: - distance( - handle, query + batch_offset * n, exemplars_dense.data(), dist.data(), samples_per_batch, n_exemplars, n, true); - break; - default: RAFT_EXPECTS(false, "Incorrect metric passed!"); - } + handle, + query + batch_offset * n, + exemplars_dense.data(), + dist.data(), + samples_per_batch, + n_exemplars, + n, + true); + break; + case raft::distance::DistanceType::L1: + raft::distance::distance( + handle, + query + batch_offset * n, + exemplars_dense.data(), + dist.data(), + samples_per_batch, + n_exemplars, + n, + true); + break; + case raft::distance::DistanceType::CosineExpanded: + raft::distance:: + distance( + handle, + query + batch_offset * n, + exemplars_dense.data(), + dist.data(), + samples_per_batch, + n_exemplars, + n, + true); + break; + default: RAFT_EXPECTS(false, "Incorrect metric passed!"); + } // compute the minimum distances to exemplars of each cluster value_idx n_elements = samples_per_batch * n_selected_clusters; @@ -414,7 +435,8 @@ void 
all_points_membership_vectors(const raft::handle_t& handle,
   if (batch_size > m) batch_size = m;
   RAFT_EXPECTS(0 < batch_size && batch_size <= m,
-               "Invalid batch_size. batch_size should be > 0 and <= the number of samples in the training data");
+               "Invalid batch_size. batch_size should be > 0 and <= the number of samples in the "
+               "training data");
   auto parents = condensed_tree.get_parents();
   auto children = condensed_tree.get_children();
@@ -541,8 +563,9 @@ void membership_vector(const raft::handle_t& handle,
   value_t* lambdas = condensed_tree.get_lambdas();
   if (batch_size > n_prediction_points) batch_size = n_prediction_points;
-  RAFT_EXPECTS(0 < batch_size && batch_size <= n_prediction_points,
-               "Invalid batch_size. batch_size should be > 0 and <= the number of prediction points");
+  RAFT_EXPECTS(
+    0 < batch_size && batch_size <= n_prediction_points,
+    "Invalid batch_size. batch_size should be > 0 and <= the number of prediction points");
   rmm::device_uvector<value_t> dist_membership_vec(n_prediction_points * n_selected_clusters, stream);