diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6898e528a3..d6cf851aef 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -249,7 +249,7 @@ if(OPENMP_FOUND) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") endif(OPENMP_FOUND) -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda --expt-relaxed-constexpr") if(${CMAKE_VERSION} VERSION_LESS "3.17.0") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++14") diff --git a/cpp/src/dbscan/adjgraph/algo.cuh b/cpp/src/dbscan/adjgraph/algo.cuh index f80a827d7c..24a9f3f720 100644 --- a/cpp/src/dbscan/adjgraph/algo.cuh +++ b/cpp/src/dbscan/adjgraph/algo.cuh @@ -24,7 +24,7 @@ #include "../common.cuh" #include "pack.h" -#include <sparse/csr.cuh> +#include <sparse/convert/csr.cuh> using namespace thrust; @@ -54,7 +54,7 @@ void launcher(const raft::handle_t &handle, Pack data, Index_ batchSize, int minPts = data.minPts; Index_ *vd = data.vd; - MLCommon::Sparse::csr_adj_graph_batched( + raft::sparse::convert::csr_adj_graph_batched( data.ex_scan, data.N, data.adjnnz, batchSize, data.adj, data.adj_graph, stream, [core_pts, minPts, vd] __device__(Index_ row, Index_ start_idx, diff --git a/cpp/src/dbscan/runner.cuh b/cpp/src/dbscan/runner.cuh index 1f0a5c7f8d..785b93897e 100644 --- a/cpp/src/dbscan/runner.cuh +++ b/cpp/src/dbscan/runner.cuh @@ -140,7 +140,7 @@ size_t run(const raft::handle_t& handle, Type_f* x, Index_ N, Index_ D, temp += exScanSize; // Running VertexDeg - MLCommon::Sparse::WeakCCState state(xa, fa, m); + raft::sparse::WeakCCState state(xa, fa, m); MLCommon::device_buffer<Index_> adj_graph(handle.get_device_allocator(), stream); @@ -190,7 +190,7 @@ size_t run(const raft::handle_t& handle, Type_f* x, Index_ N, Index_ D, CUML_LOG_DEBUG("--> Computing connected components"); start_time = raft::curTimeMillis(); - MLCommon::Sparse::weak_cc_batched( + raft::sparse::weak_cc_batched( labels, ex_scan, adj_graph.data(), curradjlen, N, startVertexId, nPoints, &state, stream, [core_pts, startVertexId, nPoints] __device__(Index_ global_id) { diff --git a/cpp/src/knn/knn_sparse.cu b/cpp/src/knn/knn_sparse.cu index 0a73f54211..bbaf6d9c1c 100644 --- a/cpp/src/knn/knn_sparse.cu +++ b/cpp/src/knn/knn_sparse.cu @@ -19,7 +19,7 @@ #include #include -#include +#include #include @@ -40,7 +40,7 @@ void brute_force_knn(raft::handle_t &handle, const int *idx_indptr, cusparseHandle_t cusparse_handle = handle.get_cusparse_handle(); cudaStream_t stream = handle.get_stream(); - MLCommon::Sparse::Selection::brute_force_knn( + raft::sparse::selection::brute_force_knn( idx_indptr, idx_indices, idx_data, idx_nnz, n_idx_rows, n_idx_cols, query_indptr, query_indices, query_data, query_nnz, n_query_rows, n_query_cols, output_indices, output_dists, k, cusparse_handle, d_alloc, diff --git a/cpp/src/spectral/spectral.cu b/cpp/src/spectral/spectral.cu index ea7f43a075..6db945698b 100644 --- a/cpp/src/spectral/spectral.cu +++ b/cpp/src/spectral/spectral.cu @@ -17,7 +17,7 @@ #include #include -#include +#include namespace ML { @@ -38,10 +38,8 @@ namespace Spectral { */ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, float *vals, int nnz, int n, int n_components, float *out) { - const auto &impl = handle; - MLCommon::Spectral::fit_embedding( - impl.get_cusparse_handle(), rows, cols, vals, nnz, n, n_components, out, - handle.get_device_allocator(), handle.get_stream()); + raft::sparse::spectral::fit_embedding(handle, rows, cols, vals, nnz, n, + n_components, out); } } // namespace Spectral } // namespace ML
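The spectral change above is the pattern repeated throughout this patch: instead of unpacking the handle into a cuSPARSE handle, allocator, and stream at every call site, the raft::handle_t is now passed through whole. A minimal caller sketch (the wrapper name is illustrative, not from this patch; `out` is assumed preallocated to n * n_components):

// Hypothetical caller of the new cuML wrapper shown above.
void embed_affinity_graph(const raft::handle_t &handle, int *rows, int *cols,
                          float *vals, int nnz, int n, int n_components,
                          float *out) {
  // The handle supplies the cuSPARSE handle, device allocator, and stream.
  ML::Spectral::fit_embedding(handle, rows, cols, vals, nnz, n, n_components,
                              out);
}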
diff --git a/cpp/src/tsne/distances.cuh b/cpp/src/tsne/distances.cuh index 7dc31e20dc..4baa50aaea 100644 --- a/cpp/src/tsne/distances.cuh +++ b/cpp/src/tsne/distances.cuh @@ -21,7 +21,8 @@ #include #include #include -#include +#include +#include #include @@ -82,7 +83,7 @@ template <> void get_distances(const raft::handle_t &handle, manifold_sparse_inputs_t &input, knn_graph &k_graph, cudaStream_t stream) { - MLCommon::Sparse::Selection::brute_force_knn( + raft::sparse::selection::brute_force_knn( input.indptr, input.indices, input.data, input.nnz, input.n, input.d, input.indptr, input.indices, input.data, input.nnz, input.n, input.d, k_graph.knn_indices, k_graph.knn_dists, k_graph.n_neighbors, @@ -135,17 +136,16 @@ void normalize_distances(const value_idx n, value_t *distances, * @param[in] handle: The GPU handle. */ template -void symmetrize_perplexity( - float *P, value_idx *indices, const value_idx n, const int k, - const value_t exaggeration, - MLCommon::Sparse::COO<value_t, value_idx> *COO_Matrix, cudaStream_t stream, - const raft::handle_t &handle) { +void symmetrize_perplexity(float *P, value_idx *indices, const value_idx n, + const int k, const value_t exaggeration, + raft::sparse::COO<value_t, value_idx> *COO_Matrix, + cudaStream_t stream, const raft::handle_t &handle) { // Perform (P + P.T) / P_sum * early_exaggeration const value_t div = exaggeration / (2.0f * n); raft::linalg::scalarMultiply(P, P, div, n * k, stream); // Symmetrize to form P + P.T - MLCommon::Sparse::from_knn_symmetrize_matrix( + raft::sparse::linalg::from_knn_symmetrize_matrix( indices, P, n, k, COO_Matrix, stream, handle.get_device_allocator()); } diff --git a/cpp/src/tsne/tsne_runner.cuh b/cpp/src/tsne/tsne_runner.cuh index ddcb66e5fb..cc8eef9daf 100644 --- a/cpp/src/tsne/tsne_runner.cuh +++ b/cpp/src/tsne/tsne_runner.cuh @@ -203,7 +203,7 @@ class TSNE_runner { const bool initialize_embeddings; bool barnes_hut; - MLCommon::Sparse::COO<value_t, value_idx> COO_Matrix; + raft::sparse::COO<value_t, value_idx> COO_Matrix; value_idx n, p; value_t *Y; }; diff --git a/cpp/src/umap/fuzzy_simpl_set/naive.cuh b/cpp/src/umap/fuzzy_simpl_set/naive.cuh index d40f127524..08d9b27b75 100644 --- a/cpp/src/umap/fuzzy_simpl_set/naive.cuh +++ b/cpp/src/umap/fuzzy_simpl_set/naive.cuh @@ -23,8 +23,10 @@ #include #include +#include #include #include +#include #include @@ -276,8 +278,8 @@ void smooth_knn_dist(int n, const value_idx *knn_indices, * @param stream cuda stream to use for device operations */ template -void launcher(int n, const value_idx *knn_indices, const float *knn_dists, - int n_neighbors, MLCommon::Sparse::COO<value_t> *out, +void launcher(int n, const value_idx *knn_indices, const value_t *knn_dists, + int n_neighbors, raft::sparse::COO<value_t> *out, UMAPParams *params, std::shared_ptr d_alloc, cudaStream_t stream) { /** @@ -292,7 +294,7 @@ void launcher(int n, const value_idx *knn_indices, const float *knn_dists, n, knn_indices, knn_dists, rhos.data(), sigmas.data(), params, n_neighbors, params->local_connectivity, d_alloc, stream); - MLCommon::Sparse::COO<value_t> in(d_alloc, stream, n * n_neighbors, n, n); + raft::sparse::COO<value_t> in(d_alloc, stream, n * n_neighbors, n, n); // check for logging in order to avoid the potentially costly `arr2Str` call! if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG)) { @@ -329,7 +331,7 @@ void launcher(int n, const value_idx *knn_indices, const float *knn_dists, * one via a fuzzy union. (Symmetrize knn graph).
*/ float set_op_mix_ratio = params->set_op_mix_ratio; - MLCommon::Sparse::coo_symmetrize( + raft::sparse::linalg::coo_symmetrize( &in, out, [set_op_mix_ratio] __device__(int row, int col, value_t result, value_t transpose) { @@ -340,7 +342,7 @@ void launcher(int n, const value_idx *knn_indices, const float *knn_dists, }, d_alloc, stream); - MLCommon::Sparse::coo_sort(out, d_alloc, stream); + raft::sparse::op::coo_sort(out, d_alloc, stream); } } // namespace Naive } // namespace FuzzySimplSet diff --git a/cpp/src/umap/fuzzy_simpl_set/runner.cuh b/cpp/src/umap/fuzzy_simpl_set/runner.cuh index aba1bbf883..6836865bac 100644 --- a/cpp/src/umap/fuzzy_simpl_set/runner.cuh +++ b/cpp/src/umap/fuzzy_simpl_set/runner.cuh @@ -41,7 +41,7 @@ using namespace ML; */ template void run(int n, const value_idx *knn_indices, const T *knn_dists, - int n_neighbors, MLCommon::Sparse::COO *coo, UMAPParams *params, + int n_neighbors, raft::sparse::COO *coo, UMAPParams *params, std::shared_ptr alloc, cudaStream_t stream, int algorithm = 0) { switch (algorithm) { diff --git a/cpp/src/umap/init_embed/runner.cuh b/cpp/src/umap/init_embed/runner.cuh index 5045c8c8af..c3a4dbdaa5 100644 --- a/cpp/src/umap/init_embed/runner.cuh +++ b/cpp/src/umap/init_embed/runner.cuh @@ -32,7 +32,7 @@ using namespace ML; template void run(const raft::handle_t &handle, int n, int d, const value_idx *knn_indices, const T *knn_dists, - MLCommon::Sparse::COO *coo, UMAPParams *params, T *embedding, + raft::sparse::COO *coo, UMAPParams *params, T *embedding, cudaStream_t stream, int algo = 0) { switch (algo) { /** diff --git a/cpp/src/umap/init_embed/spectral_algo.cuh b/cpp/src/umap/init_embed/spectral_algo.cuh index 5f6175ee95..3ec13fdd2c 100644 --- a/cpp/src/umap/init_embed/spectral_algo.cuh +++ b/cpp/src/umap/init_embed/spectral_algo.cuh @@ -43,8 +43,7 @@ using namespace ML; template void launcher(const raft::handle_t &handle, int n, int d, const value_idx *knn_indices, const T *knn_dists, - MLCommon::Sparse::COO *coo, UMAPParams *params, - T *embedding) { + raft::sparse::COO *coo, UMAPParams *params, T *embedding) { cudaStream_t stream = handle.get_stream(); ASSERT(n > params->n_components, diff --git a/cpp/src/umap/knn_graph/algo.cuh b/cpp/src/umap/knn_graph/algo.cuh index 1fd7c15356..d940849a9a 100644 --- a/cpp/src/umap/knn_graph/algo.cuh +++ b/cpp/src/umap/knn_graph/algo.cuh @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include @@ -85,7 +85,7 @@ void launcher(const raft::handle_t &handle, const ML::UMAPParams *params, std::shared_ptr d_alloc, cudaStream_t stream) { - MLCommon::Sparse::Selection::brute_force_knn( + raft::sparse::selection::brute_force_knn( inputsA.indptr, inputsA.indices, inputsA.data, inputsA.nnz, inputsA.n, inputsA.d, inputsB.indptr, inputsB.indices, inputsB.data, inputsB.nnz, inputsB.n, inputsB.d, out.knn_indices, out.knn_dists, n_neighbors, diff --git a/cpp/src/umap/runner.cuh b/cpp/src/umap/runner.cuh index 6c48521e9d..9a26c6e266 100644 --- a/cpp/src/umap/runner.cuh +++ b/cpp/src/umap/runner.cuh @@ -36,8 +36,11 @@ #include #include +#include +#include #include -#include +#include +#include #include @@ -51,7 +54,6 @@ namespace FuzzySimplSetImpl = FuzzySimplSet::Naive; namespace SimplSetEmbedImpl = SimplSetEmbed::Algo; using namespace ML; -using namespace MLCommon::Sparse; template __global__ void init_transform(int *indices, T *weights, int n, @@ -126,7 +128,7 @@ void _fit(const raft::handle_t &handle, const umap_inputs &inputs, CUML_LOG_DEBUG("Done. 
Calling fuzzy simplicial set"); ML::PUSH_RANGE("umap::simplicial_set"); - COO rgraph_coo(d_alloc, stream); + raft::sparse::COO rgraph_coo(d_alloc, stream); FuzzySimplSet::run( inputs.n, knn_graph.knn_indices, knn_graph.knn_dists, k, &rgraph_coo, params, d_alloc, stream); @@ -135,8 +137,8 @@ void _fit(const raft::handle_t &handle, const umap_inputs &inputs, /** * Remove zeros from simplicial set */ - COO cgraph_coo(d_alloc, stream); - MLCommon::Sparse::coo_remove_zeros(&rgraph_coo, &cgraph_coo, + raft::sparse::COO cgraph_coo(d_alloc, stream); + raft::sparse::op::coo_remove_zeros(&rgraph_coo, &cgraph_coo, d_alloc, stream); ML::POP_RANGE(); @@ -209,8 +211,8 @@ void _fit_supervised(const raft::handle_t &handle, const umap_inputs &inputs, * Allocate workspace for fuzzy simplicial set. */ ML::PUSH_RANGE("umap::simplicial_set"); - COO rgraph_coo(d_alloc, stream); - COO tmp_coo(d_alloc, stream); + raft::sparse::COO rgraph_coo(d_alloc, stream); + raft::sparse::COO tmp_coo(d_alloc, stream); /** * Run Fuzzy simplicial set @@ -221,10 +223,10 @@ void _fit_supervised(const raft::handle_t &handle, const umap_inputs &inputs, &tmp_coo, params, d_alloc, stream); CUDA_CHECK(cudaPeekAtLastError()); - MLCommon::Sparse::coo_remove_zeros(&tmp_coo, &rgraph_coo, + raft::sparse::op::coo_remove_zeros(&tmp_coo, &rgraph_coo, d_alloc, stream); - COO final_coo(d_alloc, stream); + raft::sparse::COO final_coo(d_alloc, stream); /** * If target metric is 'categorical', perform @@ -247,10 +249,10 @@ void _fit_supervised(const raft::handle_t &handle, const umap_inputs &inputs, /** * Remove zeros */ - MLCommon::Sparse::coo_sort(&final_coo, d_alloc, stream); + raft::sparse::op::coo_sort(&final_coo, d_alloc, stream); - COO ocoo(d_alloc, stream); - MLCommon::Sparse::coo_remove_zeros(&final_coo, &ocoo, d_alloc, + raft::sparse::COO ocoo(d_alloc, stream); + raft::sparse::op::coo_remove_zeros(&final_coo, &ocoo, d_alloc, stream); ML::POP_RANGE(); @@ -366,7 +368,8 @@ void _transform(const raft::handle_t &handle, const umap_inputs &inputs, * Allocate workspace for fuzzy simplicial set. 
*/ - COO graph_coo(d_alloc, stream, nnz, inputs.n, inputs.n); + raft::sparse::COO graph_coo(d_alloc, stream, nnz, inputs.n, + inputs.n); FuzzySimplSetImpl::compute_membership_strength_kernel <<>>(knn_graph.knn_indices, knn_graph.knn_dists, @@ -378,9 +381,9 @@ void _transform(const raft::handle_t &handle, const umap_inputs &inputs, MLCommon::device_buffer row_ind(d_alloc, stream, inputs.n); MLCommon::device_buffer ia(d_alloc, stream, inputs.n); - MLCommon::Sparse::sorted_coo_to_csr(&graph_coo, row_ind.data(), d_alloc, - stream); - MLCommon::Sparse::coo_row_count(&graph_coo, ia.data(), stream); + raft::sparse::convert::sorted_coo_to_csr(&graph_coo, row_ind.data(), d_alloc, + stream); + raft::sparse::linalg::coo_degree(&graph_coo, ia.data(), stream); MLCommon::device_buffer vals_normed(d_alloc, stream, graph_coo.nnz); CUDA_CHECK(cudaMemsetAsync(vals_normed.data(), 0, @@ -388,7 +391,7 @@ void _transform(const raft::handle_t &handle, const umap_inputs &inputs, CUML_LOG_DEBUG("Performing L1 normalization"); - MLCommon::Sparse::csr_row_normalize_l1( + raft::sparse::linalg::csr_row_normalize_l1( row_ind.data(), graph_coo.vals(), graph_coo.nnz, graph_coo.n_rows, vals_normed.data(), stream); @@ -402,7 +405,7 @@ void _transform(const raft::handle_t &handle, const umap_inputs &inputs, CUDA_CHECK(cudaPeekAtLastError()); /** - * Go through COO values and set everything that's less than + * Go through raft::sparse::COO values and set everything that's less than * vals.max() / params->n_epochs to 0.0 */ thrust::device_ptr d_ptr = @@ -437,8 +440,8 @@ void _transform(const raft::handle_t &handle, const umap_inputs &inputs, /** * Remove zeros */ - MLCommon::Sparse::COO comp_coo(d_alloc, stream); - MLCommon::Sparse::coo_remove_zeros(&graph_coo, &comp_coo, + raft::sparse::COO comp_coo(d_alloc, stream); + raft::sparse::op::coo_remove_zeros(&graph_coo, &comp_coo, d_alloc, stream); ML::PUSH_RANGE("umap::optimization"); diff --git a/cpp/src/umap/simpl_set_embed/algo.cuh b/cpp/src/umap/simpl_set_embed/algo.cuh index 8543a5e634..81b242cdd5 100644 --- a/cpp/src/umap/simpl_set_embed/algo.cuh +++ b/cpp/src/umap/simpl_set_embed/algo.cuh @@ -31,6 +31,8 @@ #include #include "optimize_batch_kernel.cuh" +#include + #pragma once namespace UMAPAlgo { @@ -194,7 +196,7 @@ void optimize_layout(T *head_embedding, int head_n, T *tail_embedding, * and their 1-skeletons. 
*/ template -void launcher(int m, int n, MLCommon::Sparse::COO *in, UMAPParams *params, +void launcher(int m, int n, raft::sparse::COO *in, UMAPParams *params, T *embedding, std::shared_ptr d_alloc, cudaStream_t stream) { int nnz = in->nnz; @@ -228,8 +230,8 @@ void launcher(int m, int n, MLCommon::Sparse::COO *in, UMAPParams *params, }, stream); - MLCommon::Sparse::COO out(d_alloc, stream); - MLCommon::Sparse::coo_remove_zeros(in, &out, d_alloc, stream); + raft::sparse::COO out(d_alloc, stream); + raft::sparse::op::coo_remove_zeros(in, &out, d_alloc, stream); MLCommon::device_buffer epochs_per_sample(d_alloc, stream, out.nnz); CUDA_CHECK( diff --git a/cpp/src/umap/simpl_set_embed/runner.cuh b/cpp/src/umap/simpl_set_embed/runner.cuh index c8b95b0842..59c3b4c812 100644 --- a/cpp/src/umap/simpl_set_embed/runner.cuh +++ b/cpp/src/umap/simpl_set_embed/runner.cuh @@ -28,7 +28,7 @@ namespace SimplSetEmbed { using namespace ML; template -void run(int m, int n, MLCommon::Sparse::COO *coo, UMAPParams *params, +void run(int m, int n, raft::sparse::COO *coo, UMAPParams *params, T *embedding, std::shared_ptr alloc, cudaStream_t stream, int algorithm = 0) { switch (algorithm) { diff --git a/cpp/src/umap/supervised.cuh b/cpp/src/umap/supervised.cuh index 5fe50c8cf5..b2d9a5414b 100644 --- a/cpp/src/umap/supervised.cuh +++ b/cpp/src/umap/supervised.cuh @@ -34,8 +34,12 @@ #include #include +#include #include -#include +#include +#include +#include +#include #include @@ -47,8 +51,6 @@ namespace Supervised { using namespace ML; -using namespace MLCommon::Sparse; - template __global__ void fast_intersection_kernel(int *rows, int *cols, T *vals, int nnz, T *target, float unknown_dist = 1.0, @@ -65,21 +67,23 @@ __global__ void fast_intersection_kernel(int *rows, int *cols, T *vals, int nnz, } template -void reset_local_connectivity(COO *in_coo, COO *out_coo, +void reset_local_connectivity(raft::sparse::COO *in_coo, + raft::sparse::COO *out_coo, std::shared_ptr d_alloc, cudaStream_t stream // size = nnz*2 ) { MLCommon::device_buffer row_ind(d_alloc, stream, in_coo->n_rows); - MLCommon::Sparse::sorted_coo_to_csr(in_coo, row_ind.data(), d_alloc, stream); + raft::sparse::convert::sorted_coo_to_csr(in_coo, row_ind.data(), d_alloc, + stream); // Perform l_inf normalization - MLCommon::Sparse::csr_row_normalize_max( + raft::sparse::linalg::csr_row_normalize_max( row_ind.data(), in_coo->vals(), in_coo->nnz, in_coo->n_rows, in_coo->vals(), stream); CUDA_CHECK(cudaPeekAtLastError()); - MLCommon::Sparse::coo_symmetrize( + raft::sparse::linalg::coo_symmetrize( in_coo, out_coo, [] __device__(int row, int col, T result, T transpose) { T prod_matrix = result * transpose; @@ -98,11 +102,9 @@ void reset_local_connectivity(COO *in_coo, COO *out_coo, * data. 
*/ template -void categorical_simplicial_set_intersection(COO *graph_coo, - value_t *target, - cudaStream_t stream, - float far_dist = 5.0, - float unknown_dist = 1.0) { +void categorical_simplicial_set_intersection( + raft::sparse::COO *graph_coo, value_t *target, cudaStream_t stream, + float far_dist = 5.0, float unknown_dist = 1.0) { dim3 grid(raft::ceildiv(graph_coo->nnz, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); fast_intersection_kernel<<>>( @@ -120,13 +122,13 @@ __global__ void sset_intersection_kernel( if (row < m) { int start_idx_res = result_ind[row]; - int stop_idx_res = MLCommon::Sparse::get_stop_idx(row, m, nnz, result_ind); + int stop_idx_res = raft::sparse::get_stop_idx(row, m, nnz, result_ind); int start_idx1 = row_ind1[row]; - int stop_idx1 = MLCommon::Sparse::get_stop_idx(row, m, nnz1, row_ind1); + int stop_idx1 = raft::sparse::get_stop_idx(row, m, nnz1, row_ind1); int start_idx2 = row_ind2[row]; - int stop_idx2 = MLCommon::Sparse::get_stop_idx(row, m, nnz2, row_ind2); + int stop_idx2 = raft::sparse::get_stop_idx(row, m, nnz2, row_ind2); for (int j = start_idx_res; j < stop_idx_res; j++) { int col = result_cols[j]; @@ -164,13 +166,14 @@ __global__ void sset_intersection_kernel( */ template void general_simplicial_set_intersection( - int *row1_ind, COO *in1, int *row2_ind, COO *in2, COO *result, - float weight, std::shared_ptr d_alloc, cudaStream_t stream) { + int *row1_ind, raft::sparse::COO *in1, int *row2_ind, + raft::sparse::COO *in2, raft::sparse::COO *result, float weight, + std::shared_ptr d_alloc, cudaStream_t stream) { MLCommon::device_buffer result_ind(d_alloc, stream, in1->n_rows); CUDA_CHECK( cudaMemsetAsync(result_ind.data(), 0, in1->n_rows * sizeof(int), stream)); - int result_nnz = MLCommon::Sparse::csr_add_calc_inds( + int result_nnz = raft::sparse::linalg::csr_add_calc_inds( row1_ind, in1->cols(), in1->vals(), in1->nnz, row2_ind, in2->cols(), in2->vals(), in2->nnz, in1->n_rows, result_ind.data(), d_alloc, stream); @@ -179,14 +182,14 @@ void general_simplicial_set_intersection( /** * Element-wise sum of two simplicial sets */ - MLCommon::Sparse::csr_add_finalize( + raft::sparse::linalg::csr_add_finalize( row1_ind, in1->cols(), in1->vals(), in1->nnz, row2_ind, in2->cols(), in2->vals(), in2->nnz, in1->n_rows, result_ind.data(), result->cols(), result->vals(), stream); //@todo: Write a wrapper function for this - MLCommon::Sparse::csr_to_coo(result_ind.data(), result->n_rows, - result->rows(), result->nnz, stream); + raft::sparse::convert::csr_to_coo( + result_ind.data(), result->n_rows, result->rows(), result->nnz, stream); thrust::device_ptr d_ptr1 = thrust::device_pointer_cast(in1->vals()); T min1 = *(thrust::min_element(thrust::cuda::par.on(stream), d_ptr1, @@ -212,8 +215,9 @@ void general_simplicial_set_intersection( } template -void perform_categorical_intersection(T *y, COO *rgraph_coo, - COO *final_coo, UMAPParams *params, +void perform_categorical_intersection(T *y, raft::sparse::COO *rgraph_coo, + raft::sparse::COO *final_coo, + UMAPParams *params, std::shared_ptr d_alloc, cudaStream_t stream) { float far_dist = 1.0e12; // target weight @@ -223,8 +227,9 @@ void perform_categorical_intersection(T *y, COO *rgraph_coo, categorical_simplicial_set_intersection(rgraph_coo, y, stream, far_dist); - COO comp_coo(d_alloc, stream); - coo_remove_zeros(rgraph_coo, &comp_coo, d_alloc, stream); + raft::sparse::COO comp_coo(d_alloc, stream); + raft::sparse::op::coo_remove_zeros(rgraph_coo, &comp_coo, d_alloc, + stream); reset_local_connectivity(&comp_coo, final_coo, 
d_alloc, stream); @@ -233,9 +238,9 @@ void perform_categorical_intersection(T *y, COO *rgraph_coo, template void perform_general_intersection(const raft::handle_t &handle, value_t *y, - COO *rgraph_coo, - COO *final_coo, UMAPParams *params, - cudaStream_t stream) { + raft::sparse::COO *rgraph_coo, + raft::sparse::COO *final_coo, + UMAPParams *params, cudaStream_t stream) { auto d_alloc = handle.get_device_allocator(); /** @@ -272,7 +277,7 @@ void perform_general_intersection(const raft::handle_t &handle, value_t *y, /** * Compute fuzzy simplicial set */ - COO ygraph_coo(d_alloc, stream); + raft::sparse::COO ygraph_coo(d_alloc, stream); FuzzySimplSet::run( rgraph_coo->n_rows, y_knn_indices.data(), y_knn_dists.data(), @@ -297,15 +302,16 @@ void perform_general_intersection(const raft::handle_t &handle, value_t *y, CUDA_CHECK(cudaMemsetAsync(yrow_ind.data(), 0, ygraph_coo.n_rows * sizeof(int), stream)); - COO cygraph_coo(d_alloc, stream); - coo_remove_zeros(&ygraph_coo, &cygraph_coo, d_alloc, stream); + raft::sparse::COO cygraph_coo(d_alloc, stream); + raft::sparse::op::coo_remove_zeros(&ygraph_coo, &cygraph_coo, + d_alloc, stream); - MLCommon::Sparse::sorted_coo_to_csr(&cygraph_coo, yrow_ind.data(), d_alloc, - stream); - MLCommon::Sparse::sorted_coo_to_csr(rgraph_coo, xrow_ind.data(), d_alloc, - stream); + raft::sparse::convert::sorted_coo_to_csr(&cygraph_coo, yrow_ind.data(), + d_alloc, stream); + raft::sparse::convert::sorted_coo_to_csr(rgraph_coo, xrow_ind.data(), d_alloc, + stream); - COO result_coo(d_alloc, stream); + raft::sparse::COO result_coo(d_alloc, stream); general_simplicial_set_intersection( xrow_ind.data(), rgraph_coo, yrow_ind.data(), &cygraph_coo, &result_coo, params->target_weights, d_alloc, stream); @@ -313,8 +319,9 @@ void perform_general_intersection(const raft::handle_t &handle, value_t *y, /** * Remove zeros */ - COO out(d_alloc, stream); - coo_remove_zeros(&result_coo, &out, d_alloc, stream); + raft::sparse::COO out(d_alloc, stream); + raft::sparse::op::coo_remove_zeros(&result_coo, &out, d_alloc, + stream); reset_local_connectivity(&out, final_coo, d_alloc, stream); diff --git a/cpp/src_prims/selection/columnWiseSort.cuh b/cpp/src_prims/selection/columnWiseSort.cuh index d80e5dd9f1..6f1563c3d8 100644 --- a/cpp/src_prims/selection/columnWiseSort.cuh +++ b/cpp/src_prims/selection/columnWiseSort.cuh @@ -175,8 +175,7 @@ template void sortColumnsPerRow(const InType *in, OutType *out, int n_rows, int n_columns, bool &bAllocWorkspace, void *workspacePtr, size_t &workspaceSize, cudaStream_t stream, - InType *sortedKeys = nullptr, bool ascending = true) { - ASSERT(ascending, "Descending sort not implemented yet"); + InType *sortedKeys = nullptr) { // assume non-square row-major matrices // current use-case: KNN, trustworthiness scores // output : either sorted indices or sorted indices and input values @@ -232,17 +231,10 @@ void sortColumnsPerRow(const InType *in, OutType *out, int n_rows, OutType *tmpValIn = nullptr; int *tmpOffsetBuffer = nullptr; - if (ascending) { - // first call is to get size of workspace - CUDA_CHECK(cub::DeviceSegmentedRadixSort::SortPairs( - workspacePtr, workspaceSize, in, sortedKeys, tmpValIn, out, - totalElements, numSegments, tmpOffsetBuffer, tmpOffsetBuffer + 1)); - } else { - // first call is to get size of workspace - CUDA_CHECK(cub::DeviceSegmentedRadixSort::SortPairsDescending( - workspacePtr, workspaceSize, in, sortedKeys, tmpValIn, out, - totalElements, numSegments, tmpOffsetBuffer, tmpOffsetBuffer + 1)); - } + // first call is to get size 
of workspace + CUDA_CHECK(cub::DeviceSegmentedRadixSort::SortPairs( + workspacePtr, workspaceSize, in, sortedKeys, tmpValIn, out, + totalElements, numSegments, tmpOffsetBuffer, tmpOffsetBuffer + 1)); bAllocWorkspace = true; // more staging space for temp output of keys if (!sortedKeys) @@ -283,17 +275,10 @@ void sortColumnsPerRow(const InType *in, OutType *out, int n_rows, CUDA_CHECK( layoutSortOffset(dSegmentOffsets, n_columns, numSegments, stream)); - if (ascending) { - CUDA_CHECK(cub::DeviceSegmentedRadixSort::SortPairs( - workspacePtr, workspaceSize, in, sortedKeys, dValuesIn, out, - totalElements, numSegments, dSegmentOffsets, dSegmentOffsets + 1, 0, - sizeof(InType) * 8, stream)); - } else { - CUDA_CHECK(cub::DeviceSegmentedRadixSort::SortPairsDescending( - workspacePtr, workspaceSize, in, sortedKeys, dValuesIn, out, - totalElements, numSegments, dSegmentOffsets, dSegmentOffsets + 1, 0, - sizeof(InType) * 8, stream)); - } + CUDA_CHECK(cub::DeviceSegmentedRadixSort::SortPairs( + workspacePtr, workspaceSize, in, sortedKeys, dValuesIn, out, + totalElements, numSegments, dSegmentOffsets, dSegmentOffsets + 1, 0, + sizeof(InType) * 8, stream)); } } else { // batched per row device wide sort @@ -337,15 +322,9 @@ void sortColumnsPerRow(const InType *in, OutType *out, int n_rows, OutType *rowOut = reinterpret_cast( (size_t)out + (i * sizeof(OutType) * (size_t)n_columns)); - if (ascending) { - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( - workspacePtr, workspaceSize, rowIn, sortedKeys, dValuesIn, rowOut, - n_columns)); - } else { - CUDA_CHECK(cub::DeviceRadixSort::SortPairsDescending( - workspacePtr, workspaceSize, rowIn, sortedKeys, dValuesIn, rowOut, - n_columns)); - } + CUDA_CHECK(cub::DeviceRadixSort::SortPairs(workspacePtr, workspaceSize, + rowIn, sortedKeys, dValuesIn, + rowOut, n_columns)); if (userKeyOutputBuffer) sortedKeys = reinterpret_cast( diff --git a/cpp/src_prims/sparse/convert/coo.cuh b/cpp/src_prims/sparse/convert/coo.cuh new file mode 100644 index 0000000000..21ea45a0ef --- /dev/null +++ b/cpp/src_prims/sparse/convert/coo.cuh @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +#include +#include + +namespace raft { namespace sparse { namespace convert { + +template <int TPB_X, typename value_idx> +__global__ void csr_to_coo_kernel(const value_idx *row_ind, value_idx m, + value_idx *coo_rows, value_idx nnz) { + // row-based matrix 1 thread per row + value_idx row = (blockIdx.x * TPB_X) + threadIdx.x; + if (row < m) { + value_idx start_idx = row_ind[row]; + value_idx stop_idx = get_stop_idx(row, m, nnz, row_ind); + for (value_idx i = start_idx; i < stop_idx; i++) coo_rows[i] = row; + } +} + +/** + * @brief Convert a CSR row_ind array to a COO rows array + * @param row_ind: Input CSR row_ind array + * @param m: size of row_ind array + * @param coo_rows: Output COO row array + * @param nnz: size of output COO row array + * @param stream: cuda stream to use + */ +template <typename value_idx, int TPB_X = 32> +void csr_to_coo(const value_idx *row_ind, value_idx m, value_idx *coo_rows, + value_idx nnz, cudaStream_t stream) { + // @TODO: Use cusparse for this. + dim3 grid(raft::ceildiv(m, (value_idx)TPB_X), 1, 1); + dim3 blk(TPB_X, 1, 1); + + csr_to_coo_kernel<TPB_X> + <<<grid, blk, 0, stream>>>(row_ind, m, coo_rows, nnz); + + CUDA_CHECK(cudaGetLastError()); +} + +}; // end NAMESPACE convert +}; // end NAMESPACE sparse +}; // end NAMESPACE raft \ No newline at end of file
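A short usage sketch for the new csr_to_coo prim, assuming `indptr` is a device array of m row offsets and `d_alloc`/`stream` are in scope; the example values are made up:

// Expand CSR row offsets into explicit COO row indices.
// For indptr = [0, 2, 2, 5] with m = 4 and nnz = 6, the kernel writes
// coo_rows = [0, 0, 2, 2, 2, 3] (the last row's extent is closed by nnz).
raft::mr::device::buffer<int> coo_rows(d_alloc, stream, nnz);
raft::sparse::convert::csr_to_coo(indptr, m, coo_rows.data(), nnz, stream);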
diff --git a/cpp/src_prims/sparse/convert/csr.cuh b/cpp/src_prims/sparse/convert/csr.cuh new file mode 100644 index 0000000000..0f5ce6d10f --- /dev/null +++ b/cpp/src_prims/sparse/convert/csr.cuh @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include + +namespace raft { namespace sparse { namespace convert { + +template <typename value_t> +void coo_to_csr(const raft::handle_t &handle, const int *srcRows, + const int *srcCols, const value_t *srcVals, int nnz, int m, + int *dst_offsets, int *dstCols, value_t *dstVals) { + auto stream = handle.get_stream(); + auto cusparseHandle = handle.get_cusparse_handle(); + auto d_alloc = handle.get_device_allocator(); + raft::mr::device::buffer<int> dstRows(d_alloc, stream, nnz); + CUDA_CHECK(cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, + cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz, + cudaMemcpyDeviceToDevice, stream)); + auto buffSize = raft::sparse::cusparsecoosort_bufferSizeExt( + cusparseHandle, m, m, nnz, srcRows, srcCols, stream); + raft::mr::device::buffer<char> pBuffer(d_alloc, stream, buffSize); + raft::mr::device::buffer<int> P(d_alloc, stream, nnz); + CUSPARSE_CHECK( + cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data())); + raft::sparse::cusparsecoosortByRow(cusparseHandle, m, m, nnz, dstRows.data(), + dstCols, P.data(), pBuffer.data(), stream); + raft::sparse::cusparsegthr(cusparseHandle, nnz, srcVals, dstVals, P.data(), + stream); + raft::sparse::cusparsecoo2csr(cusparseHandle, dstRows.data(), nnz, m, + dst_offsets, stream); + CUDA_CHECK(cudaDeviceSynchronize()); +} + +/** + * @brief Constructs an adjacency graph CSR row_ind_ptr array from + * a row_ind array and adjacency array. + * @tparam Index_ the numeric type of the index arrays + * @tparam TPB_X the number of threads to use per block for kernels + * @tparam Lambda function for fused operation in the adj_graph construction + * @param row_ind the input CSR row_ind array + * @param total_rows number of vertices in graph + * @param nnz number of non-zeros + * @param batchSize number of vertices in current batch + * @param adj an adjacency array (size batchSize x total_rows) + * @param row_ind_ptr output CSR row_ind_ptr for adjacency graph + * @param stream cuda stream to use + * @param fused_op: the fused operation + */ +template <typename Index_, int TPB_X = 32, + typename Lambda = auto (Index_, Index_, Index_) -> void> +void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, + Index_ batchSize, const bool *adj, + Index_ *row_ind_ptr, cudaStream_t stream, + Lambda fused_op) { + op::csr_row_op( + row_ind, batchSize, nnz, + [fused_op, adj, total_rows, row_ind_ptr, batchSize, nnz] __device__( + Index_ row, Index_ start_idx, Index_ stop_idx) { + fused_op(row, start_idx, stop_idx); + Index_ k = 0; + for (Index_ i = 0; i < total_rows; i++) { + // @todo: uncoalesced mem accesses! + if (adj[batchSize * i + row]) { + row_ind_ptr[start_idx + k] = i; + k += 1; + } + } + }, + stream); +} + +template <typename Index_, int TPB_X = 32, + typename Lambda = auto (Index_, Index_, Index_) -> void> +void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, + Index_ batchSize, const bool *adj, + Index_ *row_ind_ptr, cudaStream_t stream) { + csr_adj_graph_batched( + row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream, + [] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {}); +} + +/** + * @brief Constructs an adjacency graph CSR row_ind_ptr array from + * a row_ind array and adjacency array.
+ * @tparam Index_ the numeric type of the index arrays + * @tparam TPB_X the number of threads to use per block for kernels + * @param row_ind the input CSR row_ind array + * @param total_rows number of total vertices in graph + * @param nnz number of non-zeros + * @param adj an adjacency array + * @param row_ind_ptr output CSR row_ind_ptr for adjacency graph + * @param stream cuda stream to use + * @param fused_op the fused operation + */ +template <typename Index_, int TPB_X = 32, + typename Lambda = auto (Index_, Index_, Index_) -> void> +void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz, + const bool *adj, Index_ *row_ind_ptr, cudaStream_t stream, + Lambda fused_op) { + csr_adj_graph_batched(row_ind, total_rows, nnz, total_rows, + adj, row_ind_ptr, stream, fused_op); +} + +/** + * @brief Generate the row indices array for a sorted COO matrix + * + * @param rows: COO rows array + * @param nnz: size of COO rows array + * @param row_ind: output row indices array + * @param m: number of rows in dense matrix + * @param d_alloc device allocator for temporary buffers + * @param stream: cuda stream to use + */ +template <typename T> +void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, + std::shared_ptr<raft::mr::device::allocator> d_alloc, + cudaStream_t stream) { + raft::mr::device::buffer<T> row_counts(d_alloc, stream, m); + + CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, m * sizeof(T), stream)); + + linalg::coo_degree<32>(rows, nnz, row_counts.data(), stream); + + // create csr compressed row index from row counts + thrust::device_ptr<T> row_counts_d = + thrust::device_pointer_cast(row_counts.data()); + thrust::device_ptr<T> c_ind_d = thrust::device_pointer_cast(row_ind); + exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, + c_ind_d); +} + +/** + * @brief Generate the row indices array for a sorted COO matrix + * + * @param coo: Input COO matrix + * @param row_ind: output row indices array + * @param d_alloc device allocator for temporary buffers + * @param stream: cuda stream to use + */ +template <typename T> +void sorted_coo_to_csr(COO<T> *coo, int *row_ind, + std::shared_ptr<raft::mr::device::allocator> d_alloc, + cudaStream_t stream) { + sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, d_alloc, + stream); +} + +}; // end NAMESPACE convert +}; // end NAMESPACE sparse +}; // end NAMESPACE raft \ No newline at end of file
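Usage sketch for the relocated sorted_coo_to_csr, assuming `in_coo` is a row-sorted raft::sparse::COO<float> and `d_alloc`/`stream` are in scope:

// Build the m-element CSR row index (exclusive scan of per-row counts).
// Note the array carries no trailing nnz entry, which is why consumers
// such as csr_to_coo above take nnz as a separate argument.
raft::mr::device::buffer<int> row_ind(d_alloc, stream, in_coo.n_rows);
raft::sparse::convert::sorted_coo_to_csr(&in_coo, row_ind.data(), d_alloc,
                                         stream);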
diff --git a/cpp/src_prims/sparse/convert/dense.cuh b/cpp/src_prims/sparse/convert/dense.cuh new file mode 100644 index 0000000000..772596f6df --- /dev/null +++ b/cpp/src_prims/sparse/convert/dense.cuh @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#include + +namespace raft { namespace sparse { namespace convert { + +template <typename value_t> +__global__ void csr_to_dense_warp_per_row_kernel(int n_cols, + const value_t *csrVal, + const int *csrRowPtr, + const int *csrColInd, + value_t *a) { + int row = blockIdx.x; + int tid = threadIdx.x; + + int colStart = csrRowPtr[row]; + int colEnd = csrRowPtr[row + 1]; + int rowNnz = colEnd - colStart; + + for (int i = tid; i < rowNnz; i += blockDim.x) { + int colIdx = colStart + i; + if (colIdx < colEnd) { + int col = csrColInd[colIdx]; + a[row * n_cols + col] = csrVal[colIdx]; + } + } +} + +/** + * Convert CSR arrays to a dense matrix in either row- + * or column-major format. A custom kernel is used when + * row-major output is desired since cusparse does not + * output row-major. + * @tparam value_idx : data type of the CSR index arrays + * @tparam value_t : data type of the CSR value array + * @param[in] handle : cusparse handle for conversion + * @param[in] nrows : number of rows in CSR + * @param[in] ncols : number of columns in CSR + * @param[in] csr_indptr : CSR row index pointer array + * @param[in] csr_indices : CSR column indices array + * @param[in] csr_data : CSR data array + * @param[in] lda : Leading dimension (used for col-major only) + * @param[out] out : Dense output array of size nrows * ncols + * @param[in] stream : Cuda stream for ordering events + * @param[in] row_major : Is row-major output desired? + */ +template <typename value_idx, typename value_t> +void csr_to_dense(cusparseHandle_t handle, value_idx nrows, value_idx ncols, + const value_idx *csr_indptr, const value_idx *csr_indices, + const value_t *csr_data, value_idx lda, value_t *out, + cudaStream_t stream, bool row_major = true) { + if (!row_major) { + /** + * If we need col-major, use cusparse. + */ + cusparseMatDescr_t out_mat; + CUSPARSE_CHECK(cusparseCreateMatDescr(&out_mat)); + CUSPARSE_CHECK(cusparseSetMatIndexBase(out_mat, CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CHECK(cusparseSetMatType(out_mat, CUSPARSE_MATRIX_TYPE_GENERAL)); + + CUSPARSE_CHECK(raft::sparse::cusparsecsr2dense( + handle, nrows, ncols, out_mat, csr_data, csr_indptr, csr_indices, out, + lda, stream)); + + CUSPARSE_CHECK_NO_THROW(cusparseDestroyMatDescr(out_mat)); + + } else { + int blockdim = block_dim(ncols); + CUDA_CHECK( + cudaMemsetAsync(out, 0, nrows * ncols * sizeof(value_t), stream)); + csr_to_dense_warp_per_row_kernel<<<nrows, blockdim, 0, stream>>>( + ncols, csr_data, csr_indptr, csr_indices, out); + } +} + +}; // end NAMESPACE convert +}; // end NAMESPACE sparse +}; // end NAMESPACE raft \ No newline at end of file diff --git a/cpp/src_prims/sparse/coo.cuh b/cpp/src_prims/sparse/coo.cuh index 79da8dc18d..520f29d292 100644 --- a/cpp/src_prims/sparse/coo.cuh +++ b/cpp/src_prims/sparse/coo.cuh @@ -14,12 +14,11 @@ * limitations under the License. */ -#include -#include "csr.cuh" - +#include #include - -#include +#include +#include +#include #include @@ -28,8 +27,6 @@ #include #include -#include -#include #include #include @@ -37,8 +34,8 @@ #pragma once -namespace MLCommon { -namespace Sparse { +namespace raft { +namespace sparse { /** @brief A Container object for sparse coordinate. There are two motivations * behind using a container for COO arrays.
@@ -61,9 +58,9 @@ namespace Sparse { template class COO { protected: - device_buffer rows_arr; - device_buffer cols_arr; - device_buffer vals_arr; + raft::mr::device::buffer rows_arr; + raft::mr::device::buffer cols_arr; + raft::mr::device::buffer vals_arr; public: Index_Type nnz; @@ -74,7 +71,7 @@ class COO { * @param d_alloc: the device allocator to use for the underlying buffers * @param stream: CUDA stream to use */ - COO(std::shared_ptr d_alloc, cudaStream_t stream) + COO(std::shared_ptr d_alloc, cudaStream_t stream) : rows_arr(d_alloc, stream, 0), cols_arr(d_alloc, stream, 0), vals_arr(d_alloc, stream, 0), @@ -90,8 +87,9 @@ class COO { * @param n_rows: number of rows in the dense matrix * @param n_cols: number of cols in the dense matrix */ - COO(device_buffer &rows, device_buffer &cols, - device_buffer &vals, Index_Type nnz, Index_Type n_rows = 0, + COO(raft::mr::device::buffer &rows, + raft::mr::device::buffer &cols, + raft::mr::device::buffer &vals, Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0) : rows_arr(rows), cols_arr(cols), @@ -108,7 +106,7 @@ class COO { * @param n_cols: number of cols in the dense matrix * @param init: initialize arrays with zeros */ - COO(std::shared_ptr d_alloc, cudaStream_t stream, + COO(std::shared_ptr d_alloc, cudaStream_t stream, Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0, bool init = true) : rows_arr(d_alloc, stream, nnz), @@ -257,731 +255,5 @@ class COO { } }; -/** - * @brief Sorts the arrays that comprise the coo matrix - * by row. - * - * @param m number of rows in coo matrix - * @param n number of cols in coo matrix - * @param nnz number of non-zeros - * @param rows rows array from coo matrix - * @param cols cols array from coo matrix - * @param vals vals array from coo matrix - * @param d_alloc device allocator for temporary buffers - * @param stream: cuda stream to use - */ -template -void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals, - std::shared_ptr d_alloc, cudaStream_t stream) { - cusparseHandle_t handle = NULL; - - size_t pBufferSizeInBytes = 0; - - CUSPARSE_CHECK(cusparseCreate(&handle)); - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK(cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, rows, cols, - &pBufferSizeInBytes)); - - device_buffer d_P(d_alloc, stream, nnz); - device_buffer pBuffer(d_alloc, stream, pBufferSizeInBytes); - - CUSPARSE_CHECK(cusparseCreateIdentityPermutation(handle, nnz, d_P.data())); - - CUSPARSE_CHECK(cusparseXcoosortByRow(handle, m, n, nnz, rows, cols, - d_P.data(), pBuffer.data())); - - device_buffer vals_sorted(d_alloc, stream, nnz); - - CUSPARSE_CHECK(raft::sparse::cusparsegthr( - handle, nnz, vals, vals_sorted.data(), d_P.data(), stream)); - - CUDA_CHECK(cudaStreamSynchronize(stream)); - - raft::copy(vals, vals_sorted.data(), nnz, stream); - - CUSPARSE_CHECK(cusparseDestroy(handle)); -} - -/** - * @brief Sort the underlying COO arrays by row - * @tparam T: the type name of the underlying value array - * @param in: COO to sort by row - * @param d_alloc device allocator for temporary buffers - * @param stream: the cuda stream to use - */ -template -void coo_sort(COO *const in, std::shared_ptr d_alloc, - cudaStream_t stream) { - coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), - in->vals(), d_alloc, stream); -} - -template -__global__ void coo_remove_zeros_kernel(const int *rows, const int *cols, - const T *vals, int nnz, int *crows, - int *ccols, T *cvals, int *ex_scan, - int *cur_ex_scan, int m) { - int row = (blockIdx.x * 
TPB_X) + threadIdx.x; - - if (row < m) { - int start = cur_ex_scan[row]; - int stop = MLCommon::Sparse::get_stop_idx(row, m, nnz, cur_ex_scan); - int cur_out_idx = ex_scan[row]; - - for (int idx = start; idx < stop; idx++) { - if (vals[idx] != 0.0) { - crows[cur_out_idx] = rows[idx]; - ccols[cur_out_idx] = cols[idx]; - cvals[cur_out_idx] = vals[idx]; - ++cur_out_idx; - } - } - } -} - -template -__global__ void coo_remove_scalar_kernel(const int *rows, const int *cols, - const T *vals, int nnz, int *crows, - int *ccols, T *cvals, int *ex_scan, - int *cur_ex_scan, int m, T scalar) { - int row = (blockIdx.x * TPB_X) + threadIdx.x; - - if (row < m) { - int start = cur_ex_scan[row]; - int stop = MLCommon::Sparse::get_stop_idx(row, m, nnz, cur_ex_scan); - int cur_out_idx = ex_scan[row]; - - for (int idx = start; idx < stop; idx++) { - if (vals[idx] != scalar) { - crows[cur_out_idx] = rows[idx]; - ccols[cur_out_idx] = cols[idx]; - cvals[cur_out_idx] = vals[idx]; - ++cur_out_idx; - } - } - } -} - -/** - * @brief Count all the rows in the coo row array and place them in the - * results matrix, indexed by row. - * - * @tparam TPB_X: number of threads to use per block - * @param rows the rows array of the coo matrix - * @param nnz the size of the rows array - * @param results array to place results - */ -template -__global__ void coo_row_count_kernel(const int *rows, int nnz, int *results) { - int row = (blockIdx.x * TPB_X) + threadIdx.x; - if (row < nnz) { - raft::myAtomicAdd(results + rows[row], 1); - } -} - -/** - * @brief Count the number of values for each row - * @tparam TPB_X: number of threads to use per block - * @param rows: rows array of the COO matrix - * @param nnz: size of the rows array - * @param results: output result array - * @param stream: cuda stream to use - */ -template -void coo_row_count(const int *rows, int nnz, int *results, - cudaStream_t stream) { - dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1); - dim3 blk_rc(TPB_X, 1, 1); - - coo_row_count_kernel - <<>>(rows, nnz, results); - CUDA_CHECK(cudaGetLastError()); -} - -/** - * @brief Count the number of values for each row - * @tparam TPB_X: number of threads to use per block - * @tparam T: type name of underlying values array - * @param in: input COO object for counting rows - * @param results: output array with row counts (size=in->n_rows) - * @param stream: cuda stream to use - */ -template -void coo_row_count(COO *in, int *results, cudaStream_t stream) { - dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1); - dim3 blk_rc(TPB_X, 1, 1); - - coo_row_count_kernel - <<>>(in->rows(), in->nnz, results); - CUDA_CHECK(cudaGetLastError()); -} - -template -__global__ void coo_row_count_nz_kernel(const int *rows, const T *vals, int nnz, - int *results) { - int row = (blockIdx.x * TPB_X) + threadIdx.x; - if (row < nnz && vals[row] != 0.0) { - raft::myAtomicAdd(results + rows[row], 1); - } -} - -template -__global__ void coo_row_count_scalar_kernel(const int *rows, const T *vals, - int nnz, T scalar, int *results) { - int row = (blockIdx.x * TPB_X) + threadIdx.x; - if (row < nnz && vals[row] != scalar) { - raft::myAtomicAdd(results + rows[row], 1); - } -} - -/** - * @brief Count the number of values for each row matching a particular scalar - * @tparam TPB_X: number of threads to use per block - * @tparam T: the type name of the underlying value arrays - * @param in: Input COO array - * @param scalar: scalar to match for counting rows - * @param results: output row counts - * @param stream: cuda stream to use - */ -template -void 
coo_row_count_scalar(COO *in, T scalar, int *results, - cudaStream_t stream) { - dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1); - dim3 blk_rc(TPB_X, 1, 1); - - coo_row_count_scalar_kernel<<>>( - in->rows(), in->vals(), in->nnz, scalar, results); - CUDA_CHECK(cudaGetLastError()); -} - -/** - * @brief Count the number of values for each row matching a particular scalar - * @tparam TPB_X: number of threads to use per block - * @tparam T: the type name of the underlying value arrays - * @param rows: Input COO row array - * @param vals: Input COO val arrays - * @param nnz: size of input COO arrays - * @param scalar: scalar to match for counting rows - * @param results: output row counts - * @param stream: cuda stream to use - */ -template -void coo_row_count_scalar(const int *rows, const T *vals, int nnz, T scalar, - int *results, cudaStream_t stream = 0) { - dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1); - dim3 blk_rc(TPB_X, 1, 1); - - coo_row_count_scalar_kernel - <<>>(rows, vals, nnz, scalar, results); - CUDA_CHECK(cudaGetLastError()); -} - -/** - * @brief Count the number of nonzeros for each row - * @tparam TPB_X: number of threads to use per block - * @tparam T: the type name of the underlying value arrays - * @param rows: Input COO row array - * @param vals: Input COO val arrays - * @param nnz: size of input COO arrays - * @param results: output row counts - * @param stream: cuda stream to use - */ -template -void coo_row_count_nz(const int *rows, const T *vals, int nnz, int *results, - cudaStream_t stream) { - dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1); - dim3 blk_rc(TPB_X, 1, 1); - - coo_row_count_nz_kernel - <<>>(rows, vals, nnz, results); - CUDA_CHECK(cudaGetLastError()); -} - -/** - * @brief Count the number of nonzero values for each row - * @tparam TPB_X: number of threads to use per block - * @tparam T: the type name of the underlying value arrays - * @param in: Input COO array - * @param results: output row counts - * @param stream: cuda stream to use - */ -template -void coo_row_count_nz(COO *in, int *results, cudaStream_t stream) { - dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1); - dim3 blk_rc(TPB_X, 1, 1); - - coo_row_count_nz_kernel - <<>>(in->rows(), in->vals(), in->nnz, results); - CUDA_CHECK(cudaGetLastError()); -} - -/** - * @brief Removes the values matching a particular scalar from a COO formatted sparse matrix. 
- * - * @param rows: input array of rows (size n) - * @param cols: input array of cols (size n) - * @param vals: input array of vals (size n) - * @param nnz: size of current rows/cols/vals arrays - * @param crows: compressed array of rows - * @param ccols: compressed array of cols - * @param cvals: compressed array of vals - * @param cnnz: array of non-zero counts per row - * @param cur_cnnz array of counts per row - * @param scalar: scalar to remove from arrays - * @param n: number of rows in dense matrix - * @param d_alloc device allocator for temporary buffers - * @param stream: cuda stream to use - */ -template -void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, - int *crows, int *ccols, T *cvals, int *cnnz, - int *cur_cnnz, T scalar, int n, - std::shared_ptr d_alloc, - cudaStream_t stream) { - device_buffer ex_scan(d_alloc, stream, n); - device_buffer cur_ex_scan(d_alloc, stream, n); - - CUDA_CHECK(cudaMemsetAsync(ex_scan.data(), 0, n * sizeof(int), stream)); - CUDA_CHECK(cudaMemsetAsync(cur_ex_scan.data(), 0, n * sizeof(int), stream)); - - thrust::device_ptr dev_cnnz = thrust::device_pointer_cast(cnnz); - thrust::device_ptr dev_ex_scan = - thrust::device_pointer_cast(ex_scan.data()); - thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cnnz, dev_cnnz + n, - dev_ex_scan); - CUDA_CHECK(cudaPeekAtLastError()); - - thrust::device_ptr dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz); - thrust::device_ptr dev_cur_ex_scan = - thrust::device_pointer_cast(cur_ex_scan.data()); - thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cur_cnnz, - dev_cur_cnnz + n, dev_cur_ex_scan); - CUDA_CHECK(cudaPeekAtLastError()); - - dim3 grid(raft::ceildiv(n, TPB_X), 1, 1); - dim3 blk(TPB_X, 1, 1); - - coo_remove_scalar_kernel<<>>( - rows, cols, vals, nnz, crows, ccols, cvals, dev_ex_scan.get(), - dev_cur_ex_scan.get(), n, scalar); - CUDA_CHECK(cudaPeekAtLastError()); -} - -/** - * @brief Removes the values matching a particular scalar from a COO formatted sparse matrix. - * - * @param in: input COO matrix - * @param out: output COO matrix - * @param scalar: scalar to remove from arrays - * @param d_alloc device allocator for temporary buffers - * @param stream: cuda stream to use - */ -template -void coo_remove_scalar(COO *in, COO *out, T scalar, - std::shared_ptr d_alloc, - cudaStream_t stream) { - device_buffer row_count_nz(d_alloc, stream, in->n_rows); - device_buffer row_count(d_alloc, stream, in->n_rows); - - CUDA_CHECK( - cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream)); - CUDA_CHECK( - cudaMemsetAsync(row_count.data(), 0, in->n_rows * sizeof(int), stream)); - - MLCommon::Sparse::coo_row_count(in->rows(), in->nnz, row_count.data(), - stream); - CUDA_CHECK(cudaPeekAtLastError()); - - MLCommon::Sparse::coo_row_count_scalar( - in->rows(), in->vals(), in->nnz, scalar, row_count_nz.data(), stream); - CUDA_CHECK(cudaPeekAtLastError()); - - thrust::device_ptr d_row_count_nz = - thrust::device_pointer_cast(row_count_nz.data()); - int out_nnz = thrust::reduce(thrust::cuda::par.on(stream), d_row_count_nz, - d_row_count_nz + in->n_rows); - - out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream); - - coo_remove_scalar(in->rows(), in->cols(), in->vals(), in->nnz, - out->rows(), out->cols(), out->vals(), - row_count_nz.data(), row_count.data(), scalar, - in->n_rows, d_alloc, stream); - CUDA_CHECK(cudaPeekAtLastError()); -} - -/** - * @brief Removes zeros from a COO formatted sparse matrix. 
- * - * @param in: input COO matrix - * @param out: output COO matrix - * @param d_alloc device allocator for temporary buffers - * @param stream: cuda stream to use - */ -template -void coo_remove_zeros(COO *in, COO *out, - std::shared_ptr d_alloc, - cudaStream_t stream) { - coo_remove_scalar(in, out, T(0.0), d_alloc, stream); -} - -template -__global__ void from_knn_graph_kernel(const long *knn_indices, - const T *knn_dists, int m, int k, - int *rows, int *cols, T *vals) { - int row = (blockIdx.x * TPB_X) + threadIdx.x; - if (row < m) { - for (int i = 0; i < k; i++) { - rows[row * k + i] = row; - cols[row * k + i] = knn_indices[row * k + i]; - vals[row * k + i] = knn_dists[row * k + i]; - } - } -} - -/** - * @brief Converts a knn graph, defined by index and distance matrices, - * into COO format. - * - * @param knn_indices: knn index array - * @param knn_dists: knn distance array - * @param m: number of vertices in graph - * @param k: number of nearest neighbors - * @param rows: output COO row array - * @param cols: output COO col array - * @param vals: output COO val array - */ -template -void from_knn(const long *knn_indices, const T *knn_dists, int m, int k, - int *rows, int *cols, T *vals) { - dim3 grid(raft::ceildiv(m, 32), 1, 1); - dim3 blk(32, 1, 1); - from_knn_graph_kernel<32, T> - <<>>(knn_indices, knn_dists, m, k, rows, cols, vals); - CUDA_CHECK(cudaGetLastError()); -} - -/** - * Converts a knn graph, defined by index and distance matrices, - * into COO format. - * @param knn_indices: KNN index array (size m * k) - * @param knn_dists: KNN dist array (size m * k) - * @param m: number of vertices in graph - * @param k: number of nearest neighbors - * @param out: The output COO graph from the KNN matrices - * @param stream: CUDA stream to use - */ -template -void from_knn(const long *knn_indices, const T *knn_dists, int m, int k, - COO *out, cudaStream_t stream) { - out->allocate(m * k, m, m, true, stream); - - from_knn(knn_indices, knn_dists, m, k, out->rows(), out->cols(), out->vals()); -} - -/** - * @brief Generate the row indices array for a sorted COO matrix - * - * @param rows: COO rows array - * @param nnz: size of COO rows array - * @param row_ind: output row indices array - * @param m: number of rows in dense matrix - * @param d_alloc device allocator for temporary buffers - * @param stream: cuda stream to use - */ -template -void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, - std::shared_ptr d_alloc, - cudaStream_t stream) { - device_buffer row_counts(d_alloc, stream, m); - - CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, m * sizeof(T), stream)); - - coo_row_count<32>(rows, nnz, row_counts.data(), stream); - - // create csr compressed row index from row counts - thrust::device_ptr row_counts_d = - thrust::device_pointer_cast(row_counts.data()); - thrust::device_ptr c_ind_d = thrust::device_pointer_cast(row_ind); - exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, - c_ind_d); -} - -/** - * @brief Generate the row indices array for a sorted COO matrix - * - * @param coo: Input COO matrix - * @param row_ind: output row indices array - * @param d_alloc device allocator for temporary buffers - * @param stream: cuda stream to use - */ -template -void sorted_coo_to_csr(COO *coo, int *row_ind, - std::shared_ptr d_alloc, - cudaStream_t stream) { - sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, d_alloc, - stream); -} - -template -__global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols, - T *vals, int 
*orows, int *ocols, T *ovals, - int n, int cnnz, Lambda reduction_op) { - int row = (blockIdx.x * TPB_X) + threadIdx.x; - - if (row < n) { - int start_idx = row_ind[row]; // each thread processes one row - int stop_idx = MLCommon::Sparse::get_stop_idx(row, n, cnnz, row_ind); - - int row_nnz = 0; - int out_start_idx = start_idx * 2; - - for (int idx = 0; idx < stop_idx - start_idx; idx++) { - int cur_row = rows[idx + start_idx]; - int cur_col = cols[idx + start_idx]; - T cur_val = vals[idx + start_idx]; - - int lookup_row = cur_col; - int t_start = row_ind[lookup_row]; // Start at - int t_stop = MLCommon::Sparse::get_stop_idx(lookup_row, n, cnnz, row_ind); - - T transpose = 0.0; - - bool found_match = false; - for (int t_idx = t_start; t_idx < t_stop; t_idx++) { - // If we find a match, let's get out of the loop. We won't - // need to modify the transposed value, since that will be - // done in a different thread. - if (cols[t_idx] == cur_row && rows[t_idx] == cur_col) { - // If it exists already, set transposed value to existing value - transpose = vals[t_idx]; - found_match = true; - break; - } - } - - // Custom reduction op on value and its transpose, which enables - // specialized weighting. - // If only simple X+X.T is desired, this op can just sum - // the two values. - T res = reduction_op(cur_row, cur_col, cur_val, transpose); - - // if we didn't find an exact match, we need to add - // the computed res into our current matrix to guarantee - // symmetry. - // Note that if we did find a match, we don't need to - // compute `res` on it here because it will be computed - // in a different thread. - if (!found_match && vals[idx] != 0.0) { - orows[out_start_idx + row_nnz] = cur_col; - ocols[out_start_idx + row_nnz] = cur_row; - ovals[out_start_idx + row_nnz] = res; - ++row_nnz; - } - - if (res != 0.0) { - orows[out_start_idx + row_nnz] = cur_row; - ocols[out_start_idx + row_nnz] = cur_col; - ovals[out_start_idx + row_nnz] = res; - ++row_nnz; - } - } - } -} - -/** - * @brief takes a COO matrix which may not be symmetric and symmetrizes - * it, running a custom reduction function against the each value - * and its transposed value. - * - * @param in: Input COO matrix - * @param out: Output symmetrized COO matrix - * @param reduction_op: a custom reduction function - * @param d_alloc device allocator for temporary buffers - * @param stream: cuda stream to use - */ -template -void coo_symmetrize(COO *in, COO *out, - Lambda reduction_op, // two-argument reducer - std::shared_ptr d_alloc, - cudaStream_t stream) { - dim3 grid(raft::ceildiv(in->n_rows, TPB_X), 1, 1); - dim3 blk(TPB_X, 1, 1); - - ASSERT(!out->validate_mem(), "Expecting unallocated COO for output"); - - device_buffer in_row_ind(d_alloc, stream, in->n_rows); - - sorted_coo_to_csr(in, in_row_ind.data(), d_alloc, stream); - - out->allocate(in->nnz * 2, in->n_rows, in->n_cols, true, stream); - - coo_symmetrize_kernel<<>>( - in_row_ind.data(), in->rows(), in->cols(), in->vals(), out->rows(), - out->cols(), out->vals(), in->n_rows, in->nnz, reduction_op); - CUDA_CHECK(cudaPeekAtLastError()); -} - -/** - * @brief Find how much space needed in each row. - * We look through all datapoints and increment the count for each row. 
-/**
- * @brief Find how much space is needed in each row.
- * We look through all datapoints and increment the count for each row.
- *
- * @param data: Input knn distances (n, k)
- * @param indices: Input knn indices (n, k)
- * @param n: Number of rows
- * @param k: Number of nearest neighbors
- * @param row_sizes: Input empty row sum 1 array (n)
- * @param row_sizes2: Input empty row sum 2 array (n) for faster reduction
- */
-template <typename value_idx = int64_t, typename value_t = float>
-__global__ static void symmetric_find_size(const value_t *restrict data,
-                                           const value_idx *restrict indices,
-                                           const value_idx n, const int k,
-                                           value_idx *restrict row_sizes,
-                                           value_idx *restrict row_sizes2) {
-  const auto row = blockIdx.x * blockDim.x + threadIdx.x;  // for every row
-  const auto j =
-    blockIdx.y * blockDim.y + threadIdx.y;  // for every item in row
-  if (row >= n || j >= k) return;
-
-  const auto col = indices[row * k + j];
-  if (j % 2)
-    atomicAdd(&row_sizes[col], (value_idx)1);
-  else
-    atomicAdd(&row_sizes2[col], (value_idx)1);
-}
-
-/**
- * @brief Compute the final row sizes: row_sizes += row_sizes2 + k.
- * Reduction step for the symmetric_find_size kernel; the counts are split
- * across two arrays to reduce atomic contention.
- *
- * @param n: Number of rows
- * @param k: Number of nearest neighbors
- * @param row_sizes: Input row sum 1 array (n)
- * @param row_sizes2: Input row sum 2 array (n) for faster reduction
- */
-template <typename value_idx>
-__global__ static void reduce_find_size(const value_idx n, const int k,
-                                        value_idx *restrict row_sizes,
-                                        const value_idx *restrict row_sizes2) {
-  const auto i = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if (i >= n) return;
-  row_sizes[i] += (row_sizes2[i] + k);
-}
-
-/**
- * @brief Perform the data + data.T operation.
- * Can only run once the row sizes of data + data.T have been determined.
- *
- * @param edges: Input per-row write offsets (n), the exclusive scan of the
- * row sizes
- * @param data: Input knn distances (n, k)
- * @param indices: Input knn indices (n, k)
- * @param VAL: Output values for data + data.T
- * @param COL: Output column indices for data + data.T
- * @param ROW: Output row indices for data + data.T
- * @param n: Number of rows
- * @param k: Number of nearest neighbors
- */
-template <typename value_idx = int64_t, typename value_t = float>
-__global__ static void symmetric_sum(value_idx *restrict edges,
-                                     const value_t *restrict data,
-                                     const value_idx *restrict indices,
-                                     value_t *restrict VAL,
-                                     value_idx *restrict COL,
-                                     value_idx *restrict ROW,
-                                     const value_idx n, const int k) {
-  const auto row = blockIdx.x * blockDim.x + threadIdx.x;  // for every row
-  const auto j =
-    blockIdx.y * blockDim.y + threadIdx.y;  // for every item in row
-  if (row >= n || j >= k) return;
-
-  const auto col = indices[row * k + j];
-  const auto original = atomicAdd(&edges[row], (value_idx)1);
-  const auto transpose = atomicAdd(&edges[col], (value_idx)1);
-
-  VAL[transpose] = VAL[original] = data[row * k + j];
-  // Notice ROW and COL are swapped for the transposed entry
-  ROW[original] = row;
-  COL[original] = col;
-
-  ROW[transpose] = col;
-  COL[transpose] = row;
-}
-
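To make the edges bookkeeping concrete: symmetric_sum consumes per-row write offsets produced by an exclusive scan of the row sizes (step (4) of the driver below). A host-side illustration with made-up counts (C++17; the device code does the same thing with thrust::exclusive_scan):

#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  // Hypothetical per-row entry counts after reduce_find_size (n = 4)
  std::vector<int> row_sizes = {3, 2, 4, 1};
  std::vector<int> edges(row_sizes.size());

  // edges[i] = sum of row_sizes[0..i)
  std::exclusive_scan(row_sizes.begin(), row_sizes.end(), edges.begin(), 0);

  // Prints "0 3 5 9": row i starts writing at offset edges[i], and each
  // atomicAdd(&edges[i], 1) in symmetric_sum claims the next free slot
  for (int e : edges) std::printf("%d ", e);
  return 0;
}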
-/**
- * @brief Perform data + data.T on raw KNN data.
- * The following steps are invoked:
- * (1) Find how much space is needed in each row
- * (2) Compute the final space needed (n*k + sum(row_sizes)) == 2*n*k
- * (3) Allocate the new space
- * (4) Prepare the edges for each new row
- * (5) Perform the final data + data.T operation
- * (6) Return the summed up VAL, COL, ROW
- *
- * @param knn_indices: Input knn indices (n, k)
- * @param knn_dists: Input knn distances (n, k)
- * @param n: Number of rows
- * @param k: Number of nearest neighbors
- * @param out: Output COO Matrix class
- * @param stream: Input cuda stream
- * @param d_alloc device allocator for temporary buffers
- */
-template <typename value_idx = int64_t, typename value_t = float,
-          int TPB_X = 32, int TPB_Y = 32>
-void from_knn_symmetrize_matrix(const value_idx *restrict knn_indices,
-                                const value_t *restrict knn_dists,
-                                const value_idx n, const int k,
-                                COO<value_t, value_idx> *out,
-                                cudaStream_t stream,
-                                std::shared_ptr<deviceAllocator> d_alloc) {
-  // (1) Find how much space is needed in each row
-  // We look through all datapoints and increment the count for each row.
-  const dim3 threadsPerBlock(TPB_X, TPB_Y);
-  const dim3 numBlocks(raft::ceildiv(n, (value_idx)TPB_X),
-                       raft::ceildiv(k, TPB_Y));
-
-  // These arrays are reused as transpose_edges, original_edges in step (4)
-  device_buffer<value_idx> row_sizes(d_alloc, stream, n);
-  CUDA_CHECK(
-    cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream));
-
-  device_buffer<value_idx> row_sizes2(d_alloc, stream, n);
-  CUDA_CHECK(
-    cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream));
-
-  symmetric_find_size<<<numBlocks, threadsPerBlock, 0, stream>>>(
-    knn_dists, knn_indices, n, k, row_sizes.data(), row_sizes2.data());
-  CUDA_CHECK(cudaPeekAtLastError());
-
-  reduce_find_size<<<raft::ceildiv(n, (value_idx)1024), 1024, 0, stream>>>(
-    n, k, row_sizes.data(), row_sizes2.data());
-  CUDA_CHECK(cudaPeekAtLastError());
-
-  // (2) Compute the final space needed (n*k + sum(row_sizes)) == 2*n*k
-  // Notice we don't do any merging and leave the result as 2*NNZ
-  const auto NNZ = 2 * n * k;
-
-  // (3) Allocate the new space
-  out->allocate(NNZ, n, n, true, stream);
-
-  // (4) Prepare the edges for each new row
-  // This mirrors a CSR matrix's row pointer, where the maximum bounds for
-  // each row are calculated as the cumulative rolling sum of the previous
-  // rows. Notice we reuse the old row_sizes2 memory.
-  value_idx *edges = row_sizes2.data();
-  thrust::device_ptr<value_idx> __edges = thrust::device_pointer_cast(edges);
-  thrust::device_ptr<value_idx> __row_sizes =
-    thrust::device_pointer_cast(row_sizes.data());
-
-  // Rolling cumulative sum
-  thrust::exclusive_scan(thrust::cuda::par.on(stream), __row_sizes,
-                         __row_sizes + n, __edges);
-
-  // (5) Perform the final data + data.T operation in tandem with the memcpy
-  symmetric_sum<<<numBlocks, threadsPerBlock, 0, stream>>>(
-    edges, knn_dists, knn_indices, out->vals(), out->cols(), out->rows(), n,
-    k);
-  CUDA_CHECK(cudaPeekAtLastError());
-}
-
-}; // namespace Sparse
-}; // namespace MLCommon
+}; // namespace sparse
+}; // namespace raft
diff --git a/cpp/src_prims/sparse/csr.cuh b/cpp/src_prims/sparse/csr.cuh
index e43bbd850d..b312810cf5 100644
--- a/cpp/src_prims/sparse/csr.cuh
+++ b/cpp/src_prims/sparse/csr.cuh
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include
 #include
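Before the diff continues through csr.cuh: the symmetrization path deleted from coo.cuh above was typically driven end to end as follows. This is a hypothetical caller; the function name, buffer setup, and include path are assumptions, not code from this PR:

#include <memory>
#include <sparse/coo.cuh>  // path assumed

void symmetrize_knn(const int64_t *knn_indices, const float *knn_dists,
                    int64_t n, int k,
                    std::shared_ptr<MLCommon::deviceAllocator> d_alloc,
                    cudaStream_t stream) {
  // Output graph; from_knn_symmetrize_matrix allocates it to 2 * n * k entries
  MLCommon::Sparse::COO<float, int64_t> sym(d_alloc, stream);
  MLCommon::Sparse::from_knn_symmetrize_matrix(knn_indices, knn_dists, n, k,
                                               &sym, stream, d_alloc);
  // sym now holds both (i, j) and (j, i) for every KNN edge, with the
  // same distance value on each
}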