Skip to content

Commit

Permalink
Preparing sparse primitives for movement to RAFT(#3157)
Browse files Browse the repository at this point in the history
This PR contains the initial steps to move many of the sparse prims API over to raft, including:
- [x] Adjusting `MLCommon::Sparse` namespaces to `raft::sparse` 
- [x] Breaking csr/coo prims into multiple files (e.g. linalg, components, matrix, etc...)
- [x] Using RAFT namespaces for raft componentry used within the sparse prims, such as `device_buffer` and `deviceAllocator`. 
- [x] Use RAFT handle in public API
Closes #3106

Authors:
  - Corey J. Nolet <[email protected]>
  - Ray Douglass <[email protected]>

Approvers:
  - Divye Gala (@divyegala)
  - Divye Gala (@divyegala)
  - John Zedlewski (@JohnZed)

URL: #3157
  • Loading branch information
cjnolet authored Jan 16, 2021
1 parent ecd508c commit d72c54a
Show file tree
Hide file tree
Showing 58 changed files with 3,362 additions and 2,534 deletions.
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ if(OPENMP_FOUND)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
endif(OPENMP_FOUND)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda --expt-relaxed-constexpr")

if(${CMAKE_VERSION} VERSION_LESS "3.17.0")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++14")
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/dbscan/adjgraph/algo.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
#include "../common.cuh"
#include "pack.h"

#include <sparse/csr.cuh>
#include <sparse/convert/csr.cuh>

using namespace thrust;

Expand Down Expand Up @@ -54,7 +54,7 @@ void launcher(const raft::handle_t &handle, Pack<Index_> data, Index_ batchSize,
int minPts = data.minPts;
Index_ *vd = data.vd;

MLCommon::Sparse::csr_adj_graph_batched<Index_, TPB_X>(
raft::sparse::convert::csr_adj_graph_batched<Index_, TPB_X>(
data.ex_scan, data.N, data.adjnnz, batchSize, data.adj, data.adj_graph,
stream,
[core_pts, minPts, vd] __device__(Index_ row, Index_ start_idx,
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/dbscan/runner.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ size_t run(const raft::handle_t& handle, Type_f* x, Index_ N, Index_ D,
temp += exScanSize;

// Running VertexDeg
MLCommon::Sparse::WeakCCState state(xa, fa, m);
raft::sparse::WeakCCState state(xa, fa, m);
MLCommon::device_buffer<Index_> adj_graph(handle.get_device_allocator(),
stream);

Expand Down Expand Up @@ -190,7 +190,7 @@ size_t run(const raft::handle_t& handle, Type_f* x, Index_ N, Index_ D,
CUML_LOG_DEBUG("--> Computing connected components");

start_time = raft::curTimeMillis();
MLCommon::Sparse::weak_cc_batched<Index_, 1024>(
raft::sparse::weak_cc_batched<Index_, 1024>(
labels, ex_scan, adj_graph.data(), curradjlen, N, startVertexId, nPoints,
&state, stream,
[core_pts, startVertexId, nPoints] __device__(Index_ global_id) {
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/knn/knn_sparse.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
#include <cuml/common/logger.hpp>
#include <cuml/neighbors/knn_sparse.hpp>

#include <sparse/knn.cuh>
#include <sparse/selection/knn.cuh>

#include <cusparse_v2.h>

Expand All @@ -40,7 +40,7 @@ void brute_force_knn(raft::handle_t &handle, const int *idx_indptr,
cusparseHandle_t cusparse_handle = handle.get_cusparse_handle();
cudaStream_t stream = handle.get_stream();

MLCommon::Sparse::Selection::brute_force_knn(
raft::sparse::selection::brute_force_knn(
idx_indptr, idx_indices, idx_data, idx_nnz, n_idx_rows, n_idx_cols,
query_indptr, query_indices, query_data, query_nnz, n_query_rows,
n_query_cols, output_indices, output_dists, k, cusparse_handle, d_alloc,
Expand Down
8 changes: 3 additions & 5 deletions cpp/src/spectral/spectral.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#include <cuml/cuml.hpp>
#include <sparse/coo.cuh>

#include <sparse/spectral.cuh>
#include <sparse/linalg/spectral.cuh>

namespace ML {

Expand All @@ -38,10 +38,8 @@ namespace Spectral {
*/
void fit_embedding(const raft::handle_t &handle, int *rows, int *cols,
float *vals, int nnz, int n, int n_components, float *out) {
const auto &impl = handle;
MLCommon::Spectral::fit_embedding(
impl.get_cusparse_handle(), rows, cols, vals, nnz, n, n_components, out,
handle.get_device_allocator(), handle.get_stream());
raft::sparse::spectral::fit_embedding(handle, rows, cols, vals, nnz, n,
n_components, out);
}
} // namespace Spectral
} // namespace ML
16 changes: 8 additions & 8 deletions cpp/src/tsne/distances.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
#include <raft/linalg/eltwise.cuh>
#include <selection/knn.cuh>
#include <sparse/coo.cuh>
#include <sparse/knn.cuh>
#include <sparse/linalg/symmetrize.cuh>
#include <sparse/selection/knn.cuh>

#include <cuml/manifold/common.hpp>

Expand Down Expand Up @@ -82,7 +83,7 @@ template <>
void get_distances(const raft::handle_t &handle,
manifold_sparse_inputs_t<int, float> &input,
knn_graph<int, float> &k_graph, cudaStream_t stream) {
MLCommon::Sparse::Selection::brute_force_knn(
raft::sparse::selection::brute_force_knn(
input.indptr, input.indices, input.data, input.nnz, input.n, input.d,
input.indptr, input.indices, input.data, input.nnz, input.n, input.d,
k_graph.knn_indices, k_graph.knn_dists, k_graph.n_neighbors,
Expand Down Expand Up @@ -135,17 +136,16 @@ void normalize_distances(const value_idx n, value_t *distances,
* @param[in] handle: The GPU handle.
*/
template <typename value_idx, typename value_t, int TPB_X = 32>
void symmetrize_perplexity(
float *P, value_idx *indices, const value_idx n, const int k,
const value_t exaggeration,
MLCommon::Sparse::COO<value_t, value_idx> *COO_Matrix, cudaStream_t stream,
const raft::handle_t &handle) {
void symmetrize_perplexity(float *P, value_idx *indices, const value_idx n,
const int k, const value_t exaggeration,
raft::sparse::COO<value_t, value_idx> *COO_Matrix,
cudaStream_t stream, const raft::handle_t &handle) {
// Perform (P + P.T) / P_sum * early_exaggeration
const value_t div = exaggeration / (2.0f * n);
raft::linalg::scalarMultiply(P, P, div, n * k, stream);

// Symmetrize to form P + P.T
MLCommon::Sparse::from_knn_symmetrize_matrix(
raft::sparse::linalg::from_knn_symmetrize_matrix<value_idx, value_t>(
indices, P, n, k, COO_Matrix, stream, handle.get_device_allocator());
}

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/tsne/tsne_runner.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ class TSNE_runner {
const bool initialize_embeddings;
bool barnes_hut;

MLCommon::Sparse::COO<value_t, value_idx> COO_Matrix;
raft::sparse::COO<value_t, value_idx> COO_Matrix;
value_idx n, p;
value_t *Y;
};
Expand Down
12 changes: 7 additions & 5 deletions cpp/src/umap/fuzzy_simpl_set/naive.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@
#include <raft/cudart_utils.h>
#include <raft/cuda_utils.cuh>

#include <sparse/op/sort.h>
#include <raft/stats/mean.cuh>
#include <sparse/coo.cuh>
#include <sparse/linalg/symmetrize.cuh>

#include <cuda_runtime.h>

Expand Down Expand Up @@ -276,8 +278,8 @@ void smooth_knn_dist(int n, const value_idx *knn_indices,
* @param stream cuda stream to use for device operations
*/
template <int TPB_X, typename value_idx, typename value_t>
void launcher(int n, const value_idx *knn_indices, const float *knn_dists,
int n_neighbors, MLCommon::Sparse::COO<value_t> *out,
void launcher(int n, const value_idx *knn_indices, const value_t *knn_dists,
int n_neighbors, raft::sparse::COO<value_t> *out,
UMAPParams *params, std::shared_ptr<deviceAllocator> d_alloc,
cudaStream_t stream) {
/**
Expand All @@ -292,7 +294,7 @@ void launcher(int n, const value_idx *knn_indices, const float *knn_dists,
n, knn_indices, knn_dists, rhos.data(), sigmas.data(), params, n_neighbors,
params->local_connectivity, d_alloc, stream);

MLCommon::Sparse::COO<value_t> in(d_alloc, stream, n * n_neighbors, n, n);
raft::sparse::COO<value_t> in(d_alloc, stream, n * n_neighbors, n, n);

// check for logging in order to avoid the potentially costly `arr2Str` call!
if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG)) {
Expand Down Expand Up @@ -329,7 +331,7 @@ void launcher(int n, const value_idx *knn_indices, const float *knn_dists,
* one via a fuzzy union. (Symmetrize knn graph).
*/
float set_op_mix_ratio = params->set_op_mix_ratio;
MLCommon::Sparse::coo_symmetrize<TPB_X, value_t>(
raft::sparse::linalg::coo_symmetrize<TPB_X, value_t>(
&in, out,
[set_op_mix_ratio] __device__(int row, int col, value_t result,
value_t transpose) {
Expand All @@ -340,7 +342,7 @@ void launcher(int n, const value_idx *knn_indices, const float *knn_dists,
},
d_alloc, stream);

MLCommon::Sparse::coo_sort<value_t>(out, d_alloc, stream);
raft::sparse::op::coo_sort<value_t>(out, d_alloc, stream);
}
} // namespace Naive
} // namespace FuzzySimplSet
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/umap/fuzzy_simpl_set/runner.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ using namespace ML;
*/
template <int TPB_X, typename value_idx, typename T>
void run(int n, const value_idx *knn_indices, const T *knn_dists,
int n_neighbors, MLCommon::Sparse::COO<T> *coo, UMAPParams *params,
int n_neighbors, raft::sparse::COO<T> *coo, UMAPParams *params,
std::shared_ptr<deviceAllocator> alloc, cudaStream_t stream,
int algorithm = 0) {
switch (algorithm) {
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/umap/init_embed/runner.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ using namespace ML;
template <typename value_idx, typename T>
void run(const raft::handle_t &handle, int n, int d,
const value_idx *knn_indices, const T *knn_dists,
MLCommon::Sparse::COO<float> *coo, UMAPParams *params, T *embedding,
raft::sparse::COO<float> *coo, UMAPParams *params, T *embedding,
cudaStream_t stream, int algo = 0) {
switch (algo) {
/**
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/umap/init_embed/spectral_algo.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,7 @@ using namespace ML;
template <typename value_idx, typename T>
void launcher(const raft::handle_t &handle, int n, int d,
const value_idx *knn_indices, const T *knn_dists,
MLCommon::Sparse::COO<float> *coo, UMAPParams *params,
T *embedding) {
raft::sparse::COO<float> *coo, UMAPParams *params, T *embedding) {
cudaStream_t stream = handle.get_stream();

ASSERT(n > params->n_components,
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/umap/knn_graph/algo.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#include <iostream>
#include <raft/linalg/unary_op.cuh>
#include <selection/knn.cuh>
#include <sparse/knn.cuh>
#include <sparse/selection/knn.cuh>

#include <raft/cudart_utils.h>

Expand Down Expand Up @@ -85,7 +85,7 @@ void launcher(const raft::handle_t &handle,
const ML::UMAPParams *params,
std::shared_ptr<ML::deviceAllocator> d_alloc,
cudaStream_t stream) {
MLCommon::Sparse::Selection::brute_force_knn(
raft::sparse::selection::brute_force_knn(
inputsA.indptr, inputsA.indices, inputsA.data, inputsA.nnz, inputsA.n,
inputsA.d, inputsB.indptr, inputsB.indices, inputsB.data, inputsB.nnz,
inputsB.n, inputsB.d, out.knn_indices, out.knn_dists, n_neighbors,
Expand Down
43 changes: 23 additions & 20 deletions cpp/src/umap/runner.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,11 @@
#include <thrust/scan.h>
#include <thrust/system/cuda/execution_policy.h>

#include <sparse/op/sort.h>
#include <sparse/convert/csr.cuh>
#include <sparse/coo.cuh>
#include <sparse/csr.cuh>
#include <sparse/linalg/norm.cuh>
#include <sparse/op/filter.cuh>

#include <raft/cuda_utils.cuh>

Expand All @@ -51,7 +54,6 @@ namespace FuzzySimplSetImpl = FuzzySimplSet::Naive;
namespace SimplSetEmbedImpl = SimplSetEmbed::Algo;

using namespace ML;
using namespace MLCommon::Sparse;

template <int TPB_X, typename T>
__global__ void init_transform(int *indices, T *weights, int n,
Expand Down Expand Up @@ -126,7 +128,7 @@ void _fit(const raft::handle_t &handle, const umap_inputs &inputs,
CUML_LOG_DEBUG("Done. Calling fuzzy simplicial set");

ML::PUSH_RANGE("umap::simplicial_set");
COO<value_t> rgraph_coo(d_alloc, stream);
raft::sparse::COO<value_t> rgraph_coo(d_alloc, stream);
FuzzySimplSet::run<TPB_X, value_idx, value_t>(
inputs.n, knn_graph.knn_indices, knn_graph.knn_dists, k, &rgraph_coo,
params, d_alloc, stream);
Expand All @@ -135,8 +137,8 @@ void _fit(const raft::handle_t &handle, const umap_inputs &inputs,
/**
* Remove zeros from simplicial set
*/
COO<value_t> cgraph_coo(d_alloc, stream);
MLCommon::Sparse::coo_remove_zeros<TPB_X, value_t>(&rgraph_coo, &cgraph_coo,
raft::sparse::COO<value_t> cgraph_coo(d_alloc, stream);
raft::sparse::op::coo_remove_zeros<TPB_X, value_t>(&rgraph_coo, &cgraph_coo,
d_alloc, stream);
ML::POP_RANGE();

Expand Down Expand Up @@ -209,8 +211,8 @@ void _fit_supervised(const raft::handle_t &handle, const umap_inputs &inputs,
* Allocate workspace for fuzzy simplicial set.
*/
ML::PUSH_RANGE("umap::simplicial_set");
COO<value_t> rgraph_coo(d_alloc, stream);
COO<value_t> tmp_coo(d_alloc, stream);
raft::sparse::COO<value_t> rgraph_coo(d_alloc, stream);
raft::sparse::COO<value_t> tmp_coo(d_alloc, stream);

/**
* Run Fuzzy simplicial set
Expand All @@ -221,10 +223,10 @@ void _fit_supervised(const raft::handle_t &handle, const umap_inputs &inputs,
&tmp_coo, params, d_alloc, stream);
CUDA_CHECK(cudaPeekAtLastError());

MLCommon::Sparse::coo_remove_zeros<TPB_X, value_t>(&tmp_coo, &rgraph_coo,
raft::sparse::op::coo_remove_zeros<TPB_X, value_t>(&tmp_coo, &rgraph_coo,
d_alloc, stream);

COO<value_t> final_coo(d_alloc, stream);
raft::sparse::COO<value_t> final_coo(d_alloc, stream);

/**
* If target metric is 'categorical', perform
Expand All @@ -247,10 +249,10 @@ void _fit_supervised(const raft::handle_t &handle, const umap_inputs &inputs,
/**
* Remove zeros
*/
MLCommon::Sparse::coo_sort<value_t>(&final_coo, d_alloc, stream);
raft::sparse::op::coo_sort<value_t>(&final_coo, d_alloc, stream);

COO<value_t> ocoo(d_alloc, stream);
MLCommon::Sparse::coo_remove_zeros<TPB_X, value_t>(&final_coo, &ocoo, d_alloc,
raft::sparse::COO<value_t> ocoo(d_alloc, stream);
raft::sparse::op::coo_remove_zeros<TPB_X, value_t>(&final_coo, &ocoo, d_alloc,
stream);
ML::POP_RANGE();

Expand Down Expand Up @@ -366,7 +368,8 @@ void _transform(const raft::handle_t &handle, const umap_inputs &inputs,
* Allocate workspace for fuzzy simplicial set.
*/

COO<value_t> graph_coo(d_alloc, stream, nnz, inputs.n, inputs.n);
raft::sparse::COO<value_t> graph_coo(d_alloc, stream, nnz, inputs.n,
inputs.n);

FuzzySimplSetImpl::compute_membership_strength_kernel<TPB_X>
<<<grid_nnz, blk, 0, stream>>>(knn_graph.knn_indices, knn_graph.knn_dists,
Expand All @@ -378,17 +381,17 @@ void _transform(const raft::handle_t &handle, const umap_inputs &inputs,
MLCommon::device_buffer<int> row_ind(d_alloc, stream, inputs.n);
MLCommon::device_buffer<int> ia(d_alloc, stream, inputs.n);

MLCommon::Sparse::sorted_coo_to_csr(&graph_coo, row_ind.data(), d_alloc,
stream);
MLCommon::Sparse::coo_row_count<TPB_X>(&graph_coo, ia.data(), stream);
raft::sparse::convert::sorted_coo_to_csr(&graph_coo, row_ind.data(), d_alloc,
stream);
raft::sparse::linalg::coo_degree<TPB_X>(&graph_coo, ia.data(), stream);

MLCommon::device_buffer<value_t> vals_normed(d_alloc, stream, graph_coo.nnz);
CUDA_CHECK(cudaMemsetAsync(vals_normed.data(), 0,
graph_coo.nnz * sizeof(value_t), stream));

CUML_LOG_DEBUG("Performing L1 normalization");

MLCommon::Sparse::csr_row_normalize_l1<TPB_X, value_t>(
raft::sparse::linalg::csr_row_normalize_l1<TPB_X, value_t>(
row_ind.data(), graph_coo.vals(), graph_coo.nnz, graph_coo.n_rows,
vals_normed.data(), stream);

Expand All @@ -402,7 +405,7 @@ void _transform(const raft::handle_t &handle, const umap_inputs &inputs,
CUDA_CHECK(cudaPeekAtLastError());

/**
* Go through COO values and set everything that's less than
* Go through raft::sparse::COO values and set everything that's less than
* vals.max() / params->n_epochs to 0.0
*/
thrust::device_ptr<value_t> d_ptr =
Expand Down Expand Up @@ -437,8 +440,8 @@ void _transform(const raft::handle_t &handle, const umap_inputs &inputs,
/**
* Remove zeros
*/
MLCommon::Sparse::COO<value_t> comp_coo(d_alloc, stream);
MLCommon::Sparse::coo_remove_zeros<TPB_X, value_t>(&graph_coo, &comp_coo,
raft::sparse::COO<value_t> comp_coo(d_alloc, stream);
raft::sparse::op::coo_remove_zeros<TPB_X, value_t>(&graph_coo, &comp_coo,
d_alloc, stream);

ML::PUSH_RANGE("umap::optimization");
Expand Down
8 changes: 5 additions & 3 deletions cpp/src/umap/simpl_set_embed/algo.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
#include <string>
#include "optimize_batch_kernel.cuh"

#include <sparse/op/filter.cuh>

#pragma once

namespace UMAPAlgo {
Expand Down Expand Up @@ -194,7 +196,7 @@ void optimize_layout(T *head_embedding, int head_n, T *tail_embedding,
* and their 1-skeletons.
*/
template <int TPB_X, typename T>
void launcher(int m, int n, MLCommon::Sparse::COO<T> *in, UMAPParams *params,
void launcher(int m, int n, raft::sparse::COO<T> *in, UMAPParams *params,
T *embedding, std::shared_ptr<deviceAllocator> d_alloc,
cudaStream_t stream) {
int nnz = in->nnz;
Expand Down Expand Up @@ -228,8 +230,8 @@ void launcher(int m, int n, MLCommon::Sparse::COO<T> *in, UMAPParams *params,
},
stream);

MLCommon::Sparse::COO<T> out(d_alloc, stream);
MLCommon::Sparse::coo_remove_zeros<TPB_X, T>(in, &out, d_alloc, stream);
raft::sparse::COO<T> out(d_alloc, stream);
raft::sparse::op::coo_remove_zeros<TPB_X, T>(in, &out, d_alloc, stream);

MLCommon::device_buffer<T> epochs_per_sample(d_alloc, stream, out.nnz);
CUDA_CHECK(
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/umap/simpl_set_embed/runner.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ namespace SimplSetEmbed {
using namespace ML;

template <int TPB_X, typename T>
void run(int m, int n, MLCommon::Sparse::COO<T> *coo, UMAPParams *params,
void run(int m, int n, raft::sparse::COO<T> *coo, UMAPParams *params,
T *embedding, std::shared_ptr<deviceAllocator> alloc,
cudaStream_t stream, int algorithm = 0) {
switch (algorithm) {
Expand Down
Loading

0 comments on commit d72c54a

Please sign in to comment.