Preparing sparse primitives for movement to RAFT(#3157)

This PR contains the initial steps to move many of the sparse prims API over to raft, including: - [x] Adjusting `MLCommon::Sparse` namespaces to `raft::sparse` - [x] Breaking csr/coo prims into multiple files (e.g. linalg, components, matrix, etc...) - [x] Using RAFT namespaces for raft componentry used within the sparse prims, such as `device_buffer` and `deviceAllocator`. - [x] Use RAFT handle in public API Closes #3106 Authors: - Corey J. Nolet <[email protected]> - Ray Douglass <[email protected]> Approvers: - Divye Gala (@divyegala) - Divye Gala (@divyegala) - John Zedlewski (@JohnZed) URL: #3157
rapidsai · Jan 16, 2021 · d72c54a · d72c54a
1 parent ecd508c
commit d72c54a
Show file tree

Hide file tree

Showing 58 changed files with 3,362 additions and 2,534 deletions.
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -249,7 +249,7 @@ if(OPENMP_FOUND)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
 endif(OPENMP_FOUND)
 
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda --expt-relaxed-constexpr")
 
 if(${CMAKE_VERSION} VERSION_LESS "3.17.0")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++14")

diff --git a/cpp/src/dbscan/adjgraph/algo.cuh b/cpp/src/dbscan/adjgraph/algo.cuh
@@ -24,7 +24,7 @@
 #include "../common.cuh"
 #include "pack.h"
 
-#include <sparse/csr.cuh>
+#include <sparse/convert/csr.cuh>
 
 using namespace thrust;
 
@@ -54,7 +54,7 @@ void launcher(const raft::handle_t &handle, Pack<Index_> data, Index_ batchSize,
   int minPts = data.minPts;
   Index_ *vd = data.vd;
 
-  MLCommon::Sparse::csr_adj_graph_batched<Index_, TPB_X>(
+  raft::sparse::convert::csr_adj_graph_batched<Index_, TPB_X>(
     data.ex_scan, data.N, data.adjnnz, batchSize, data.adj, data.adj_graph,
     stream,
     [core_pts, minPts, vd] __device__(Index_ row, Index_ start_idx,

diff --git a/cpp/src/dbscan/runner.cuh b/cpp/src/dbscan/runner.cuh
@@ -140,7 +140,7 @@ size_t run(const raft::handle_t& handle, Type_f* x, Index_ N, Index_ D,
   temp += exScanSize;
 
   // Running VertexDeg
-  MLCommon::Sparse::WeakCCState state(xa, fa, m);
+  raft::sparse::WeakCCState state(xa, fa, m);
   MLCommon::device_buffer<Index_> adj_graph(handle.get_device_allocator(),
                                             stream);
 
@@ -190,7 +190,7 @@ size_t run(const raft::handle_t& handle, Type_f* x, Index_ N, Index_ D,
     CUML_LOG_DEBUG("--> Computing connected components");
 
     start_time = raft::curTimeMillis();
-    MLCommon::Sparse::weak_cc_batched<Index_, 1024>(
+    raft::sparse::weak_cc_batched<Index_, 1024>(
       labels, ex_scan, adj_graph.data(), curradjlen, N, startVertexId, nPoints,
       &state, stream,
       [core_pts, startVertexId, nPoints] __device__(Index_ global_id) {

diff --git a/cpp/src/knn/knn_sparse.cu b/cpp/src/knn/knn_sparse.cu
@@ -19,7 +19,7 @@
 #include <cuml/common/logger.hpp>
 #include <cuml/neighbors/knn_sparse.hpp>
 
-#include <sparse/knn.cuh>
+#include <sparse/selection/knn.cuh>
 
 #include <cusparse_v2.h>
 
@@ -40,7 +40,7 @@ void brute_force_knn(raft::handle_t &handle, const int *idx_indptr,
   cusparseHandle_t cusparse_handle = handle.get_cusparse_handle();
   cudaStream_t stream = handle.get_stream();
 
-  MLCommon::Sparse::Selection::brute_force_knn(
+  raft::sparse::selection::brute_force_knn(
     idx_indptr, idx_indices, idx_data, idx_nnz, n_idx_rows, n_idx_cols,
     query_indptr, query_indices, query_data, query_nnz, n_query_rows,
     n_query_cols, output_indices, output_dists, k, cusparse_handle, d_alloc,

diff --git a/cpp/src/spectral/spectral.cu b/cpp/src/spectral/spectral.cu
@@ -17,7 +17,7 @@
 #include <cuml/cuml.hpp>
 #include <sparse/coo.cuh>
 
-#include <sparse/spectral.cuh>
+#include <sparse/linalg/spectral.cuh>
 
 namespace ML {
 
@@ -38,10 +38,8 @@ namespace Spectral {
    */
 void fit_embedding(const raft::handle_t &handle, int *rows, int *cols,
                    float *vals, int nnz, int n, int n_components, float *out) {
-  const auto &impl = handle;
-  MLCommon::Spectral::fit_embedding(
-    impl.get_cusparse_handle(), rows, cols, vals, nnz, n, n_components, out,
-    handle.get_device_allocator(), handle.get_stream());
+  raft::sparse::spectral::fit_embedding(handle, rows, cols, vals, nnz, n,
+                                        n_components, out);
 }
 }  // namespace Spectral
 }  // namespace ML
diff --git a/cpp/src/tsne/distances.cuh b/cpp/src/tsne/distances.cuh
@@ -21,7 +21,8 @@
 #include <raft/linalg/eltwise.cuh>
 #include <selection/knn.cuh>
 #include <sparse/coo.cuh>
-#include <sparse/knn.cuh>
+#include <sparse/linalg/symmetrize.cuh>
+#include <sparse/selection/knn.cuh>
 
 #include <cuml/manifold/common.hpp>
 
@@ -82,7 +83,7 @@ template <>
 void get_distances(const raft::handle_t &handle,
                    manifold_sparse_inputs_t<int, float> &input,
                    knn_graph<int, float> &k_graph, cudaStream_t stream) {
-  MLCommon::Sparse::Selection::brute_force_knn(
+  raft::sparse::selection::brute_force_knn(
     input.indptr, input.indices, input.data, input.nnz, input.n, input.d,
     input.indptr, input.indices, input.data, input.nnz, input.n, input.d,
     k_graph.knn_indices, k_graph.knn_dists, k_graph.n_neighbors,
@@ -135,17 +136,16 @@ void normalize_distances(const value_idx n, value_t *distances,
  * @param[in] handle: The GPU handle.
  */
 template <typename value_idx, typename value_t, int TPB_X = 32>
-void symmetrize_perplexity(
-  float *P, value_idx *indices, const value_idx n, const int k,
-  const value_t exaggeration,
-  MLCommon::Sparse::COO<value_t, value_idx> *COO_Matrix, cudaStream_t stream,
-  const raft::handle_t &handle) {
+void symmetrize_perplexity(float *P, value_idx *indices, const value_idx n,
+                           const int k, const value_t exaggeration,
+                           raft::sparse::COO<value_t, value_idx> *COO_Matrix,
+                           cudaStream_t stream, const raft::handle_t &handle) {
   // Perform (P + P.T) / P_sum * early_exaggeration
   const value_t div = exaggeration / (2.0f * n);
   raft::linalg::scalarMultiply(P, P, div, n * k, stream);
 
   // Symmetrize to form P + P.T
-  MLCommon::Sparse::from_knn_symmetrize_matrix(
+  raft::sparse::linalg::from_knn_symmetrize_matrix<value_idx, value_t>(
     indices, P, n, k, COO_Matrix, stream, handle.get_device_allocator());
 }
 

diff --git a/cpp/src/tsne/tsne_runner.cuh b/cpp/src/tsne/tsne_runner.cuh
@@ -203,7 +203,7 @@ class TSNE_runner {
   const bool initialize_embeddings;
   bool barnes_hut;
 
-  MLCommon::Sparse::COO<value_t, value_idx> COO_Matrix;
+  raft::sparse::COO<value_t, value_idx> COO_Matrix;
   value_idx n, p;
   value_t *Y;
 };

diff --git a/cpp/src/umap/fuzzy_simpl_set/naive.cuh b/cpp/src/umap/fuzzy_simpl_set/naive.cuh
@@ -23,8 +23,10 @@
 #include <raft/cudart_utils.h>
 #include <raft/cuda_utils.cuh>
 
+#include <sparse/op/sort.h>
 #include <raft/stats/mean.cuh>
 #include <sparse/coo.cuh>
+#include <sparse/linalg/symmetrize.cuh>
 
 #include <cuda_runtime.h>
 
@@ -276,8 +278,8 @@ void smooth_knn_dist(int n, const value_idx *knn_indices,
  * @param stream cuda stream to use for device operations
  */
 template <int TPB_X, typename value_idx, typename value_t>
-void launcher(int n, const value_idx *knn_indices, const float *knn_dists,
-              int n_neighbors, MLCommon::Sparse::COO<value_t> *out,
+void launcher(int n, const value_idx *knn_indices, const value_t *knn_dists,
+              int n_neighbors, raft::sparse::COO<value_t> *out,
               UMAPParams *params, std::shared_ptr<deviceAllocator> d_alloc,
               cudaStream_t stream) {
   /**
@@ -292,7 +294,7 @@ void launcher(int n, const value_idx *knn_indices, const float *knn_dists,
     n, knn_indices, knn_dists, rhos.data(), sigmas.data(), params, n_neighbors,
     params->local_connectivity, d_alloc, stream);
 
-  MLCommon::Sparse::COO<value_t> in(d_alloc, stream, n * n_neighbors, n, n);
+  raft::sparse::COO<value_t> in(d_alloc, stream, n * n_neighbors, n, n);
 
   // check for logging in order to avoid the potentially costly `arr2Str` call!
   if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG)) {
@@ -329,7 +331,7 @@ void launcher(int n, const value_idx *knn_indices, const float *knn_dists,
    * one via a fuzzy union. (Symmetrize knn graph).
    */
   float set_op_mix_ratio = params->set_op_mix_ratio;
-  MLCommon::Sparse::coo_symmetrize<TPB_X, value_t>(
+  raft::sparse::linalg::coo_symmetrize<TPB_X, value_t>(
     &in, out,
     [set_op_mix_ratio] __device__(int row, int col, value_t result,
                                   value_t transpose) {
@@ -340,7 +342,7 @@ void launcher(int n, const value_idx *knn_indices, const float *knn_dists,
     },
     d_alloc, stream);
 
-  MLCommon::Sparse::coo_sort<value_t>(out, d_alloc, stream);
+  raft::sparse::op::coo_sort<value_t>(out, d_alloc, stream);
 }
 }  // namespace Naive
 }  // namespace FuzzySimplSet

diff --git a/cpp/src/umap/fuzzy_simpl_set/runner.cuh b/cpp/src/umap/fuzzy_simpl_set/runner.cuh
@@ -41,7 +41,7 @@ using namespace ML;
  */
 template <int TPB_X, typename value_idx, typename T>
 void run(int n, const value_idx *knn_indices, const T *knn_dists,
-         int n_neighbors, MLCommon::Sparse::COO<T> *coo, UMAPParams *params,
+         int n_neighbors, raft::sparse::COO<T> *coo, UMAPParams *params,
          std::shared_ptr<deviceAllocator> alloc, cudaStream_t stream,
          int algorithm = 0) {
   switch (algorithm) {

diff --git a/cpp/src/umap/init_embed/runner.cuh b/cpp/src/umap/init_embed/runner.cuh
@@ -32,7 +32,7 @@ using namespace ML;
 template <typename value_idx, typename T>
 void run(const raft::handle_t &handle, int n, int d,
          const value_idx *knn_indices, const T *knn_dists,
-         MLCommon::Sparse::COO<float> *coo, UMAPParams *params, T *embedding,
+         raft::sparse::COO<float> *coo, UMAPParams *params, T *embedding,
          cudaStream_t stream, int algo = 0) {
   switch (algo) {
     /**

diff --git a/cpp/src/umap/init_embed/spectral_algo.cuh b/cpp/src/umap/init_embed/spectral_algo.cuh
@@ -43,8 +43,7 @@ using namespace ML;
 template <typename value_idx, typename T>
 void launcher(const raft::handle_t &handle, int n, int d,
               const value_idx *knn_indices, const T *knn_dists,
-              MLCommon::Sparse::COO<float> *coo, UMAPParams *params,
-              T *embedding) {
+              raft::sparse::COO<float> *coo, UMAPParams *params, T *embedding) {
   cudaStream_t stream = handle.get_stream();
 
   ASSERT(n > params->n_components,

diff --git a/cpp/src/umap/knn_graph/algo.cuh b/cpp/src/umap/knn_graph/algo.cuh
@@ -20,7 +20,7 @@
 #include <iostream>
 #include <raft/linalg/unary_op.cuh>
 #include <selection/knn.cuh>
-#include <sparse/knn.cuh>
+#include <sparse/selection/knn.cuh>
 
 #include <raft/cudart_utils.h>
 
@@ -85,7 +85,7 @@ void launcher(const raft::handle_t &handle,
               const ML::UMAPParams *params,
               std::shared_ptr<ML::deviceAllocator> d_alloc,
               cudaStream_t stream) {
-  MLCommon::Sparse::Selection::brute_force_knn(
+  raft::sparse::selection::brute_force_knn(
     inputsA.indptr, inputsA.indices, inputsA.data, inputsA.nnz, inputsA.n,
     inputsA.d, inputsB.indptr, inputsB.indices, inputsB.data, inputsB.nnz,
     inputsB.n, inputsB.d, out.knn_indices, out.knn_dists, n_neighbors,

diff --git a/cpp/src/umap/runner.cuh b/cpp/src/umap/runner.cuh
@@ -36,8 +36,11 @@
 #include <thrust/scan.h>
 #include <thrust/system/cuda/execution_policy.h>
 
+#include <sparse/op/sort.h>
+#include <sparse/convert/csr.cuh>
 #include <sparse/coo.cuh>
-#include <sparse/csr.cuh>
+#include <sparse/linalg/norm.cuh>
+#include <sparse/op/filter.cuh>
 
 #include <raft/cuda_utils.cuh>
 
@@ -51,7 +54,6 @@ namespace FuzzySimplSetImpl = FuzzySimplSet::Naive;
 namespace SimplSetEmbedImpl = SimplSetEmbed::Algo;
 
 using namespace ML;
-using namespace MLCommon::Sparse;
 
 template <int TPB_X, typename T>
 __global__ void init_transform(int *indices, T *weights, int n,
@@ -126,7 +128,7 @@ void _fit(const raft::handle_t &handle, const umap_inputs &inputs,
   CUML_LOG_DEBUG("Done. Calling fuzzy simplicial set");
 
   ML::PUSH_RANGE("umap::simplicial_set");
-  COO<value_t> rgraph_coo(d_alloc, stream);
+  raft::sparse::COO<value_t> rgraph_coo(d_alloc, stream);
   FuzzySimplSet::run<TPB_X, value_idx, value_t>(
     inputs.n, knn_graph.knn_indices, knn_graph.knn_dists, k, &rgraph_coo,
     params, d_alloc, stream);
@@ -135,8 +137,8 @@ void _fit(const raft::handle_t &handle, const umap_inputs &inputs,
   /**
    * Remove zeros from simplicial set
    */
-  COO<value_t> cgraph_coo(d_alloc, stream);
-  MLCommon::Sparse::coo_remove_zeros<TPB_X, value_t>(&rgraph_coo, &cgraph_coo,
+  raft::sparse::COO<value_t> cgraph_coo(d_alloc, stream);
+  raft::sparse::op::coo_remove_zeros<TPB_X, value_t>(&rgraph_coo, &cgraph_coo,
                                                      d_alloc, stream);
   ML::POP_RANGE();
 
@@ -209,8 +211,8 @@ void _fit_supervised(const raft::handle_t &handle, const umap_inputs &inputs,
    * Allocate workspace for fuzzy simplicial set.
    */
   ML::PUSH_RANGE("umap::simplicial_set");
-  COO<value_t> rgraph_coo(d_alloc, stream);
-  COO<value_t> tmp_coo(d_alloc, stream);
+  raft::sparse::COO<value_t> rgraph_coo(d_alloc, stream);
+  raft::sparse::COO<value_t> tmp_coo(d_alloc, stream);
 
   /**
    * Run Fuzzy simplicial set
@@ -221,10 +223,10 @@ void _fit_supervised(const raft::handle_t &handle, const umap_inputs &inputs,
     &tmp_coo, params, d_alloc, stream);
   CUDA_CHECK(cudaPeekAtLastError());
 
-  MLCommon::Sparse::coo_remove_zeros<TPB_X, value_t>(&tmp_coo, &rgraph_coo,
+  raft::sparse::op::coo_remove_zeros<TPB_X, value_t>(&tmp_coo, &rgraph_coo,
                                                      d_alloc, stream);
 
-  COO<value_t> final_coo(d_alloc, stream);
+  raft::sparse::COO<value_t> final_coo(d_alloc, stream);
 
   /**
    * If target metric is 'categorical', perform
@@ -247,10 +249,10 @@ void _fit_supervised(const raft::handle_t &handle, const umap_inputs &inputs,
   /**
    * Remove zeros
    */
-  MLCommon::Sparse::coo_sort<value_t>(&final_coo, d_alloc, stream);
+  raft::sparse::op::coo_sort<value_t>(&final_coo, d_alloc, stream);
 
-  COO<value_t> ocoo(d_alloc, stream);
-  MLCommon::Sparse::coo_remove_zeros<TPB_X, value_t>(&final_coo, &ocoo, d_alloc,
+  raft::sparse::COO<value_t> ocoo(d_alloc, stream);
+  raft::sparse::op::coo_remove_zeros<TPB_X, value_t>(&final_coo, &ocoo, d_alloc,
                                                      stream);
   ML::POP_RANGE();
 
@@ -366,7 +368,8 @@ void _transform(const raft::handle_t &handle, const umap_inputs &inputs,
    * Allocate workspace for fuzzy simplicial set.
    */
 
-  COO<value_t> graph_coo(d_alloc, stream, nnz, inputs.n, inputs.n);
+  raft::sparse::COO<value_t> graph_coo(d_alloc, stream, nnz, inputs.n,
+                                       inputs.n);
 
   FuzzySimplSetImpl::compute_membership_strength_kernel<TPB_X>
     <<<grid_nnz, blk, 0, stream>>>(knn_graph.knn_indices, knn_graph.knn_dists,
@@ -378,17 +381,17 @@ void _transform(const raft::handle_t &handle, const umap_inputs &inputs,
   MLCommon::device_buffer<int> row_ind(d_alloc, stream, inputs.n);
   MLCommon::device_buffer<int> ia(d_alloc, stream, inputs.n);
 
-  MLCommon::Sparse::sorted_coo_to_csr(&graph_coo, row_ind.data(), d_alloc,
-                                      stream);
-  MLCommon::Sparse::coo_row_count<TPB_X>(&graph_coo, ia.data(), stream);
+  raft::sparse::convert::sorted_coo_to_csr(&graph_coo, row_ind.data(), d_alloc,
+                                           stream);
+  raft::sparse::linalg::coo_degree<TPB_X>(&graph_coo, ia.data(), stream);
 
   MLCommon::device_buffer<value_t> vals_normed(d_alloc, stream, graph_coo.nnz);
   CUDA_CHECK(cudaMemsetAsync(vals_normed.data(), 0,
                              graph_coo.nnz * sizeof(value_t), stream));
 
   CUML_LOG_DEBUG("Performing L1 normalization");
 
-  MLCommon::Sparse::csr_row_normalize_l1<TPB_X, value_t>(
+  raft::sparse::linalg::csr_row_normalize_l1<TPB_X, value_t>(
     row_ind.data(), graph_coo.vals(), graph_coo.nnz, graph_coo.n_rows,
     vals_normed.data(), stream);
 
@@ -402,7 +405,7 @@ void _transform(const raft::handle_t &handle, const umap_inputs &inputs,
   CUDA_CHECK(cudaPeekAtLastError());
 
   /**
-   * Go through COO values and set everything that's less than
+   * Go through raft::sparse::COO values and set everything that's less than
    * vals.max() / params->n_epochs to 0.0
    */
   thrust::device_ptr<value_t> d_ptr =
@@ -437,8 +440,8 @@ void _transform(const raft::handle_t &handle, const umap_inputs &inputs,
   /**
    * Remove zeros
    */
-  MLCommon::Sparse::COO<value_t> comp_coo(d_alloc, stream);
-  MLCommon::Sparse::coo_remove_zeros<TPB_X, value_t>(&graph_coo, &comp_coo,
+  raft::sparse::COO<value_t> comp_coo(d_alloc, stream);
+  raft::sparse::op::coo_remove_zeros<TPB_X, value_t>(&graph_coo, &comp_coo,
                                                      d_alloc, stream);
 
   ML::PUSH_RANGE("umap::optimization");

diff --git a/cpp/src/umap/simpl_set_embed/algo.cuh b/cpp/src/umap/simpl_set_embed/algo.cuh
@@ -31,6 +31,8 @@
 #include <string>
 #include "optimize_batch_kernel.cuh"
 
+#include <sparse/op/filter.cuh>
+
 #pragma once
 
 namespace UMAPAlgo {
@@ -194,7 +196,7 @@ void optimize_layout(T *head_embedding, int head_n, T *tail_embedding,
  * and their 1-skeletons.
  */
 template <int TPB_X, typename T>
-void launcher(int m, int n, MLCommon::Sparse::COO<T> *in, UMAPParams *params,
+void launcher(int m, int n, raft::sparse::COO<T> *in, UMAPParams *params,
               T *embedding, std::shared_ptr<deviceAllocator> d_alloc,
               cudaStream_t stream) {
   int nnz = in->nnz;
@@ -228,8 +230,8 @@ void launcher(int m, int n, MLCommon::Sparse::COO<T> *in, UMAPParams *params,
     },
     stream);
 
-  MLCommon::Sparse::COO<T> out(d_alloc, stream);
-  MLCommon::Sparse::coo_remove_zeros<TPB_X, T>(in, &out, d_alloc, stream);
+  raft::sparse::COO<T> out(d_alloc, stream);
+  raft::sparse::op::coo_remove_zeros<TPB_X, T>(in, &out, d_alloc, stream);
 
   MLCommon::device_buffer<T> epochs_per_sample(d_alloc, stream, out.nnz);
   CUDA_CHECK(

diff --git a/cpp/src/umap/simpl_set_embed/runner.cuh b/cpp/src/umap/simpl_set_embed/runner.cuh
@@ -28,7 +28,7 @@ namespace SimplSetEmbed {
 using namespace ML;
 
 template <int TPB_X, typename T>
-void run(int m, int n, MLCommon::Sparse::COO<T> *coo, UMAPParams *params,
+void run(int m, int n, raft::sparse::COO<T> *coo, UMAPParams *params,
          T *embedding, std::shared_ptr<deviceAllocator> alloc,
          cudaStream_t stream, int algorithm = 0) {
   switch (algorithm) {