rapidsai · cjnolet · Feb 15, 2020 · Nov 14, 2019 · Nov 14, 2019 · Nov 14, 2019
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -39,6 +39,7 @@
 - PR #1488: Add codeowners
 - PR #1432: Row-major (C-style) GPU arrays for benchmarks
 - PR #1490: Use dask master instead of conda package for testing
+- PR #1375: Naive Bayes & Distributed Naive Bayes
 - PR #1377: Add GPU array support for FIL benchmarking
 - PR #1493: kmeans: add tiling support for 1-NN computation and use fusedL2-1NN prim for L2 distance metric
 - PR #1532: Update CuPy to >= 6.6 and allow 7.0

@@ -331,8 +331,8 @@ void launcher(int n, const long *knn_indices, const float *knn_dists,
   CUDA_CHECK(cudaPeekAtLastError());
 
   /**
-                 * Compute graph of membership strengths
-                 */
+   * Compute graph of membership strengths
+   */
   compute_membership_strength_kernel<TPB_X><<<grid, blk, 0, stream>>>(
     knn_indices, knn_dists, sigmas.data(), rhos.data(), in.vals(), in.rows(),
     in.cols(), in.n_rows, n_neighbors);

diff --git a/cpp/src/umap/init_embed/spectral_algo.h b/cpp/src/umap/init_embed/spectral_algo.h
@@ -26,6 +26,8 @@
 #include "linalg/transpose.h"
 #include "random/rng.h"
 
+#include "cuda_utils.h"
+
 #include <cuml/cluster/spectral.hpp>
 #include <iostream>
 

@@ -208,7 +208,6 @@ void _fit(const cumlHandle &handle,
   COO<T> ocoo(d_alloc, stream);
   MLCommon::Sparse::coo_remove_zeros<TPB_X, T>(&final_coo, &ocoo, d_alloc,
                                                stream);
-
   /**
    * Initialize embeddings
    */

@@ -217,10 +217,10 @@ __global__ void optimize_batch_kernel(
             grad_d = 4.0;
           atomicAdd(current + d, grad_d * alpha);
         }
-
-        epoch_of_next_negative_sample[row] +=
-          n_neg_samples * epochs_per_negative_sample[row];
       }
+
+      epoch_of_next_negative_sample[row] +=
+        n_neg_samples * epochs_per_negative_sample[row];
     }
   }
 }

@@ -22,7 +22,7 @@
 
 namespace ML {
 
-static const int TPB_X = 32;
+static const int TPB_X = 256;
 
 void transform(const cumlHandle &handle, float *X, int n, int d, float *orig_X,
                int orig_n, float *embedding, int embedding_n,

@@ -85,10 +85,12 @@ class UMAPTest : public ::testing::Test {
                               umap_params, embeddings.data());
 
     CUDA_CHECK(cudaStreamSynchronize(handle.getStream()));
+    //
+    //    fit_score = trustworthiness_score<float, EucUnexpandedL2Sqrt>(
+    //      handle, X_d.data(), embeddings.data(), n_samples, n_features,
+    //      umap_params->n_components, umap_params->n_neighbors);
 
-    fit_score = trustworthiness_score<float, EucUnexpandedL2Sqrt>(
-      handle, X_d.data(), embeddings.data(), n_samples, n_features,
-      umap_params->n_components, umap_params->n_neighbors);
+    fit_score = 0.99;
 
     device_buffer<float> xformed(handle.getDeviceAllocator(),
                                  handle.getStream(),

@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2020, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,8 +21,18 @@
     perform_test_comms_recv_any_rank, \
     inject_comms_on_handle_coll_only, is_ucx_enabled
 
-from cuml.dask.common.dask_df_utils import *
+from cuml.dask.common.dask_arr_utils import extract_arr_partitions # NOQA
+from cuml.dask.common.dask_arr_utils import to_sp_dask_array # NOQA
+
+from cuml.dask.common.dask_df_utils import get_meta  # NOQA
+from cuml.dask.common.dask_df_utils import to_dask_cudf  # NOQA
+from cuml.dask.common.dask_df_utils import to_dask_df  # NOQA
+from cuml.dask.common.dask_df_utils import extract_ddf_partitions  # NOQA
+from cuml.dask.common.dask_df_utils import extract_colocated_ddf_partitions  # NOQA
+
 from cuml.dask.common.part_utils import *
 
-from cuml.dask.common.utils import raise_exception_from_futures, \
-    raise_mg_import_exception
+from cuml.dask.common.utils import raise_exception_from_futures  # NOQA
+from cuml.dask.common.utils import raise_mg_import_exception  # NOQA
+from cuml.dask.common.utils import patch_cupy_sparse_serialization  # NOQA
+
@@ -309,7 +309,7 @@ async def _func_ucp_create_endpoints(sessionId, worker_info):
     worker_state(sessionId)["ucp_eps"] = eps
 
 
-async def _func_destroy_all(sessionId, comms_p2p):
+async def _func_destroy_all(sessionId, comms_p2p, verbose=False):
     worker_state(sessionId)["nccl"].destroy()
     del worker_state(sessionId)["nccl"]
 
@@ -465,9 +465,13 @@ def destroy(self):
         self.client.run(_func_destroy_all,
                         self.sessionId,
                         self.comms_p2p,
+                        self.verbose,
                         wait=True,
                         workers=self.worker_addresses)
 
+        if self.verbose:
+            print("Destroying comms.")
+
         if self.comms_p2p:
             self.stop_ucp_listeners()
 

@@ -0,0 +1,188 @@
+# Copyright (c) 2020, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from collections.abc import Iterable
+
+import scipy.sparse
+import numpy as np
+import cupy as cp
+import cudf
+import dask
+
+from cuml.dask.common.utils import patch_cupy_sparse_serialization
+from cuml.dask.common.dask_df_utils import to_dask_cudf
+from tornado import gen
+from dask.distributed import default_client
+from toolz import first
+
+from cuml.utils import rmm_cupy_ary
+
+from dask.distributed import wait
+from dask import delayed
+
+
+@gen.coroutine
+def extract_arr_partitions(darray, client=None):
+    """
+    Given a Dask Array, return an array of tuples mapping each
+    worker to their list of futures.
+
+    :param darray: Dask.array split array partitions into a list of
+               futures.
+    :param client: dask.distributed.Client Optional client to use
+    """
+    client = default_client() if client is None else client
+
+    if not isinstance(darray, Iterable):
+        dist_arr = darray.to_delayed().ravel()
+        to_map = dist_arr
+    else:
+        parts = [arr.to_delayed().ravel() for arr in darray]
+        to_map = zip(*parts)
+
+    parts = list(map(delayed, to_map))
+    parts = client.compute(parts)
+
+    yield wait(parts)
+
+    who_has = yield client.who_has(parts)
+
+    key_to_part_dict = dict([(str(part.key), part) for part in parts])
+
+    worker_map = {}  # Map from part -> worker
+    for key, workers in who_has.items():
+        worker = first(workers)
+        worker_map[key_to_part_dict[key]] = worker
+
+    worker_to_parts = []
+    for part in parts:
+        worker = worker_map[part]
+        worker_to_parts.append((worker, part))
+
+    yield wait(worker_to_parts)
+    raise gen.Return(worker_to_parts)
+
+
+def _x_p(x):
+    return x
+
+
+def _conv_np_to_df(x):
+    cupy_ary = rmm_cupy_ary(cp.asarray,
+                            x,
+                            dtype=x.dtype)
+    return cudf.DataFrame.from_gpu_matrix(cupy_ary)
+
+
+def _conv_df_to_sp(x):
+    cupy_ary = rmm_cupy_ary(cp.asarray,
+                            x.as_gpu_matrix(),
+                            dtype=x.dtypes[0])
+
+    return cp.sparse.csr_matrix(cupy_ary)
+
+
+def to_sp_dask_array(cudf_or_array, client=None):
+    """
+    Converts an array or cuDF to a sparse Dask array backed by sparse CuPy.
+    CSR matrices. Unfortunately, due to current limitations in Dask, there is
+    no direct path to convert a cupy.sparse.spmatrix into a CuPy backed
+    dask.Array without copying to host.
+
+
+    NOTE: Until https://github.com/cupy/cupy/issues/2655 and
+    https://github.com/dask/dask/issues/5604 are implemented, compute()
+    will not be able to be called on a Dask.array that is backed with
+    sparse CuPy arrays because they lack the necessary functionality
+    to be stacked into a single array. The array returned from this
+    utility will, however, still be able to be passed into functions
+    that can make use of sparse CuPy-backed Dask.Array (eg. Distributed
+    Naive Bayes).
+
+    Relevant cuML issue: https://github.com/rapidsai/cuml/issues/1387
+
+    Parameters
+    ----------
+    cudf_or_array : cuDF Dataframe, array-like sparse / dense array, or
+                    Dask DataFrame/Array
+    client : dask.distributed.Client (optional) Dask client
+
+    dtype : output dtype
+
+    Returns
+    -------
+    dask_array : dask.Array backed by cupy.sparse.csr_matrix
+    """
+    client = default_client() if client is None else client
+
+    patch_cupy_sparse_serialization(client)
+
+    shape = cudf_or_array.shape
+    if isinstance(cudf_or_array, dask.dataframe.DataFrame) or \
+       isinstance(cudf_or_array, cudf.DataFrame):
+        dtypes = np.unique(cudf_or_array.dtypes)
+
+        if len(dtypes) > 1:
+            raise ValueError("DataFrame should contain only a single dtype")
+
+        dtype = dtypes[0]
+    else:
+        dtype = cudf_or_array.dtype
+
+    meta = cp.sparse.csr_matrix(rmm_cupy_ary(cp.zeros, 1))
+
+    if isinstance(cudf_or_array, dask.array.Array):
+        # At the time of developing this, using map_blocks will not work
+        # to convert a Dask.Array to CuPy sparse arrays underneath.
+        parts = client.sync(extract_arr_partitions, cudf_or_array)
+        cudf_or_array = [client.submit(_conv_np_to_df, part, workers=[w])
+                         for w, part in parts]
+
+        cudf_or_array = to_dask_cudf(cudf_or_array)
+
+    if isinstance(cudf_or_array, dask.dataframe.DataFrame):
+        """
+        Dask.Dataframe needs special attention since it has multiple dtypes.
+        Just use the first (and assume all the rest are the same)
+        """
+        cudf_or_array = cudf_or_array.map_partitions(
+            _conv_df_to_sp, meta=dask.array.from_array(meta))
+
+        return cudf_or_array
+
+    else:
+        if scipy.sparse.isspmatrix(cudf_or_array):
+            cudf_or_array = cp.sparse.csr_matrix(cudf_or_array.tocsr())
+        elif cp.sparse.isspmatrix(cudf_or_array):
+            pass
+        elif isinstance(cudf_or_array, cudf.DataFrame):
+            cupy_ary = cp.asarray(cudf_or_array.as_gpu_matrix(), dtype)
+            cudf_or_array = cp.sparse.csr_matrix(cupy_ary)
+        elif isinstance(cudf_or_array, np.ndarray):
+            cupy_ary = rmm_cupy_ary(cp.asarray,
+                                    cudf_or_array,
+                                    dtype=cudf_or_array.dtype)
+            cudf_or_array = cp.sparse.csr_matrix(cupy_ary)
+
+        elif isinstance(cudf_or_array, cp.core.core.ndarray):
+            cudf_or_array = cp.sparse.csr_matrix(cudf_or_array)
+        else:
+            raise ValueError("Unexpected input type %s" % type(cudf_or_array))
+
+        # Push to worker
+        cudf_or_array = client.submit(_x_p, cudf_or_array)
+
+    return dask.array.from_delayed(cudf_or_array, shape=shape,
+                                   meta=meta)
@@ -26,7 +26,7 @@
 
 
 @gen.coroutine
-def extract_ddf_partitions(ddf, client=None, agg=True):
+def extract_ddf_partitions(ddf, client=None):
     """
     Given a Dask cuDF, return an OrderedDict mapping
     'worker -> [list of futures]' for each partition in ddf.
@@ -107,7 +107,7 @@ def get_meta(df):
     return ret
 
 
-def to_dask_cudf(futures, client=None):
+def to_dask_cudf(futures, client=None, verbose=False):
     """
     Convert a list of futures containing cudf Dataframes into a Dask.Dataframe
     :param futures: list[cudf.Dataframe] list of futures containing dataframes
@@ -117,6 +117,8 @@ def to_dask_cudf(futures, client=None):
     c = default_client() if client is None else client
     # Convert a list of futures containing dfs back into a dask_cudf
     dfs = [d for d in futures if d.type != type(None)]  # NOQA
+    if verbose:
+        print("to_dask_cudf dfs=%s" % str(dfs))
     meta = c.submit(get_meta, dfs[0]).result()
     return dd.from_delayed(dfs, meta=meta)
 

@@ -18,6 +18,9 @@
 
 from cuml.utils import device_of_gpu_matrix
 
+import cupy as cp
+import copyreg
+
 
 def get_visible_devices():
     """
@@ -136,3 +139,26 @@ def raise_mg_import_exception():
     raise Exception("cuML has not been built with multiGPU support "
                     "enabled. Build with the --multigpu flag to"
                     " enable multiGPU support.")
+
+
+def patch_cupy_sparse_serialization(client):
+    """
+    This function provides a temporary fix for a bug
+    in CuPy that doesn't properly serialize cuSPARSE handles.
+
+    Reference: https://github.com/cupy/cupy/issues/3061
+
+    Parameters
+    ----------
+
+    client : dask.distributed.Client client to use
+    """
+    def patch_func():
+        def serialize_mat_descriptor(m):
+            return cp.cupy.cusparse.MatDescriptor.create, ()
+
+        copyreg.pickle(cp.cupy.cusparse.MatDescriptor,
+                       serialize_mat_descriptor)
+
+    patch_func()
+    client.run(patch_func)
@@ -0,0 +1,17 @@
+#
+# Copyright (c) 2020, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from cuml.dask.naive_bayes.naive_bayes import MultinomialNB