Refactor strings column factories (#7397)
This PR refactors the strings column factories to eliminate `device_vector` and `std::vector` parameters and to facilitate wider use of `device_uvector` in calls to the factories. This is a small part of #7287 . Multiple overloads of `make_strings_column` take `device_vector` parameters; this PR expands the use of the iterator and `device_span` overloads to enable switching to `device_uvector`, as described in #7287. It also adds new `make_device_uvector_async`/`make_device_uvector_sync` utility functions.

This will help facilitate safe CUDA stream usage.
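
For illustration only, a minimal caller-side sketch (not part of this commit) of the new utilities; the function name `example` and the variable names are hypothetical:

#include <cudf/detail/utilities/vector_factories.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cstdint>
#include <vector>

void example(rmm::cuda_stream_view stream)
{
  std::vector<int32_t> h_offsets{0, 3, 7, 12};

  // Asynchronous deep copy into a device_uvector. Does not synchronize `stream`,
  // so `h_offsets` must stay alive until the copy has completed.
  auto d_offsets = cudf::detail::make_device_uvector_async(h_offsets, stream);

  // Synchronous variant: `stream` is synchronized before it returns.
  auto d_offsets_sync = cudf::detail::make_device_uvector_sync(h_offsets, stream);

  // A device_uvector converts to cudf::device_span, so results like these can be
  // passed to the device_span overloads of factories such as make_strings_column.
}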

Authors:
  - Mark Harris (@harrism)

Approvers:
  - AJ Schmidt (@ajschmidt8)
  - Jake Hemstad (@jrhemstad)
  - David (@davidwendt)
  - Christopher Harris (@cwharris)

URL: #7397
harrism authored Mar 4, 2021
1 parent 4d0c160 commit e5d0ec9
Showing 40 changed files with 564 additions and 404 deletions.
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
@@ -96,6 +96,7 @@ test:
- test -f $PREFIX/include/cudf/detail/utilities/alignment.hpp
- test -f $PREFIX/include/cudf/detail/utilities/integer_utils.hpp
- test -f $PREFIX/include/cudf/detail/utilities/int_fastdiv.h
+- test -f $PREFIX/include/cudf/detail/utilities/vector_factories.hpp
- test -f $PREFIX/include/cudf/dictionary/detail/concatenate.hpp
- test -f $PREFIX/include/cudf/dictionary/detail/encode.hpp
- test -f $PREFIX/include/cudf/dictionary/detail/merge.hpp
7 changes: 6 additions & 1 deletion cpp/benchmarks/common/generate_benchmark_input.cpp
@@ -26,6 +26,7 @@

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
+#include <rmm/device_vector.hpp>

#include <future>
#include <memory>
@@ -411,7 +412,11 @@ std::unique_ptr<cudf::column> create_random_column<cudf::string_view>(data_profi
row += std::max(run_len - 1, 0);
}
}
-return cudf::make_strings_column(out_col.chars, out_col.offsets, out_col.null_mask);
+
+rmm::device_vector<char> d_chars(out_col.chars);
+rmm::device_vector<cudf::size_type> d_offsets(out_col.offsets);
+rmm::device_vector<cudf::bitmask_type> d_null_mask(out_col.null_mask);
+return cudf::make_strings_column(d_chars, d_offsets, d_null_mask);
}

template <>
3 changes: 1 addition & 2 deletions cpp/benchmarks/copying/shift_benchmark.cu
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2020, NVIDIA CORPORATION.
+* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -23,7 +23,6 @@

#include <benchmark/benchmark.h>

-#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
187 changes: 65 additions & 122 deletions cpp/include/cudf/column/column_factories.hpp

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/utilities/trie.cuh
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2018-2020, NVIDIA CORPORATION.
+* Copyright (c) 2018-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -30,7 +30,7 @@
#include <cuda_runtime.h>
#include <thrust/host_vector.h>

-using cudf::detail::device_span;
+using cudf::device_span;

static constexpr char trie_terminating_character = '\n';

236 changes: 236 additions & 0 deletions cpp/include/cudf/detail/utilities/vector_factories.hpp
@@ -0,0 +1,236 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/**
* @brief Convenience factories for creating device vectors from host spans
* @file vector_factories.hpp
*/

#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

namespace cudf {
namespace detail {

/**
* @brief Asynchronously construct a `device_uvector` containing a deep copy of data from a
* `host_span`
*
* @note This function does not synchronize `stream`.
*
* @tparam T The type of the data to copy
* @param source_data The host_span of data to deep copy
* @param stream The stream on which to allocate memory and perform the copy
* @param mr The memory resource to use for allocating the returned device_uvector
* @return A device_uvector containing the copied data
*/
template <typename T>
rmm::device_uvector<T> make_device_uvector_async(
host_span<T const> source_data,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
rmm::device_uvector<T> ret(source_data.size(), stream, mr);
CUDA_TRY(cudaMemcpyAsync(ret.data(),
source_data.data(),
source_data.size() * sizeof(T),
cudaMemcpyDefault,
stream.value()));
return ret;
}

/**
* @brief Asynchronously construct a `device_uvector` containing a deep copy of data from a host
* container
*
* @note This function does not synchronize `stream`.
*
* @tparam Container The type of the container to copy from
* @tparam T The type of the data to copy
* @param c The input host container from which to copy
* @param stream The stream on which to allocate memory and perform the copy
* @param mr The memory resource to use for allocating the returned device_uvector
* @return A device_uvector containing the copied data
*/
template <typename Container,
std::enable_if_t<
std::is_convertible<Container,
host_span<typename Container::value_type const>>::value>* = nullptr>
rmm::device_uvector<typename Container::value_type> make_device_uvector_async(
Container const& c,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
return make_device_uvector_async(host_span<typename Container::value_type const>{c}, stream, mr);
}

/**
* @brief Asynchronously construct a `device_uvector` containing a deep copy of data from a
* `device_span`
*
* @note This function does not synchronize `stream`.
*
* @tparam T The type of the data to copy
* @param source_data The device_span of data to deep copy
* @param stream The stream on which to allocate memory and perform the copy
* @param mr The memory resource to use for allocating the returned device_uvector
* @return A device_uvector containing the copied data
*/
template <typename T>
rmm::device_uvector<T> make_device_uvector_async(
device_span<T const> source_data,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
rmm::device_uvector<T> ret(source_data.size(), stream, mr);
CUDA_TRY(cudaMemcpyAsync(ret.data(),
source_data.data(),
source_data.size() * sizeof(T),
cudaMemcpyDefault,
stream.value()));
return ret;
}

/**
* @brief Asynchronously construct a `device_uvector` containing a deep copy of data from a device
* container
*
* @note This function does not synchronize `stream`.
*
* @tparam Container The type of the container to copy from
* @tparam T The type of the data to copy
* @param c The input device container from which to copy
* @param stream The stream on which to allocate memory and perform the copy
* @param mr The memory resource to use for allocating the returned device_uvector
* @return A device_uvector containing the copied data
*/
template <
typename Container,
std::enable_if_t<
std::is_convertible<Container, device_span<typename Container::value_type const>>::value>* =
nullptr>
rmm::device_uvector<typename Container::value_type> make_device_uvector_async(
Container const& c,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
return make_device_uvector_async(
device_span<typename Container::value_type const>{c}, stream, mr);
}

/**
* @brief Synchronously construct a `device_uvector` containing a deep copy of data from a
* `host_span`
*
* @note This function synchronizes `stream`.
*
* @tparam T The type of the data to copy
* @param source_data The host_span of data to deep copy
* @param stream The stream on which to allocate memory and perform the copy
* @param mr The memory resource to use for allocating the returned device_uvector
* @return A device_uvector containing the copied data
*/
template <typename T>
rmm::device_uvector<T> make_device_uvector_sync(
host_span<T const> source_data,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
auto ret = make_device_uvector_async(source_data, stream, mr);
stream.synchronize();
return ret;
}

/**
* @brief Synchronously construct a `device_uvector` containing a deep copy of data from a host
* container
*
* @note This function synchronizes `stream`.
*
* @tparam Container The type of the container to copy from
* @tparam T The type of the data to copy
* @param c The input host container from which to copy
* @param stream The stream on which to allocate memory and perform the copy
* @param mr The memory resource to use for allocating the returned device_uvector
* @return A device_uvector containing the copied data
*/
template <typename Container,
std::enable_if_t<
std::is_convertible<Container,
host_span<typename Container::value_type const>>::value>* = nullptr>
rmm::device_uvector<typename Container::value_type> make_device_uvector_sync(
Container const& c,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
return make_device_uvector_sync(host_span<typename Container::value_type const>{c}, stream, mr);
}

/**
* @brief Synchronously construct a `device_uvector` containing a deep copy of data from a
* `device_span`
*
* @note This function synchronizes `stream`.
*
* @tparam T The type of the data to copy
* @param source_data The device_span of data to deep copy
* @param stream The stream on which to allocate memory and perform the copy
* @param mr The memory resource to use for allocating the returned device_uvector
* @return A device_uvector containing the copied data
*/
template <typename T>
rmm::device_uvector<T> make_device_uvector_sync(
device_span<T const> source_data,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
auto ret = make_device_uvector_async(source_data, stream, mr);
stream.synchronize();
return ret;
}

/**
* @brief Synchronously construct a `device_uvector` containing a deep copy of data from a device
* container
*
* @note This function synchronizes `stream`.
*
* @tparam Container The type of the container to copy from
* @tparam T The type of the data to copy
* @param c The input device container from which to copy
* @param stream The stream on which to allocate memory and perform the copy
* @param mr The memory resource to use for allocating the returned device_uvector
* @return A device_uvector containing the copied data
*/
template <
typename Container,
std::enable_if_t<
std::is_convertible<Container, device_span<typename Container::value_type const>>::value>* =
nullptr>
rmm::device_uvector<typename Container::value_type> make_device_uvector_sync(
Container const& c,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
return make_device_uvector_sync(device_span<typename Container::value_type const>{c}, stream, mr);
}

} // namespace detail

} // namespace cudf
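
A usage note on the sync variants: because the async factories do not synchronize the stream, copying from a host container that is about to go out of scope requires the sync form. A short sketch (not part of this commit; the function name `copy_temporary` is hypothetical):

#include <cudf/detail/utilities/vector_factories.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <vector>

rmm::device_uvector<float> copy_temporary(rmm::cuda_stream_view stream)
{
  std::vector<float> tmp(1024, 1.0f);  // destroyed when this function returns

  // Unsafe with the async variant: the underlying cudaMemcpyAsync could still be
  // reading `tmp` after it has been destroyed.
  //   return cudf::detail::make_device_uvector_async(tmp, stream);

  // Safe: the sync variant synchronizes `stream` before returning, so the copy
  // has finished while `tmp` is still alive.
  return cudf::detail::make_device_uvector_sync(tmp, stream);
}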
3 changes: 1 addition & 2 deletions cpp/include/cudf/strings/detail/gather.cuh
@@ -112,8 +112,7 @@ std::unique_ptr<cudf::column> gather(
auto const d_out_chars = out_chars_column->mutable_view().template data<char>();

// fill in chars
-cudf::detail::device_span<int32_t const> const d_out_offsets_span(d_out_offsets,
-output_count + 1);
+cudf::device_span<int32_t const> const d_out_offsets_span(d_out_offsets, output_count + 1);
auto const d_in_chars = (strings_count > 0) ? strings.chars().data<char>() : nullptr;
auto gather_chars_fn =
[d_out_offsets_span, begin, d_in_offsets, d_in_chars] __device__(size_type out_char_idx) {
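
For context on `device_span` here, a small sketch (not from this commit; `span_example` is hypothetical) of the two common ways such a view is constructed, from a device container or from a raw pointer and size:

#include <cudf/utilities/span.hpp>

#include <rmm/device_uvector.hpp>

#include <cstdint>

void span_example(rmm::device_uvector<int32_t> const& d_offsets)
{
  // Non-owning view over a whole device container.
  cudf::device_span<int32_t const> all{d_offsets};

  // Equivalent view built from an explicit pointer and size, as in the gather code above.
  cudf::device_span<int32_t const> same{d_offsets.data(), d_offsets.size()};
}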