Merge branch 'branch-23.06' into parquet_level_optimization
nvdbaranec committed May 15, 2023
2 parents 8804007 + 4fe3e38 commit 8bbbab1
Showing 70 changed files with 1,125 additions and 581 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -44,7 +44,7 @@ repos:
     hooks:
       - id: cython-lint
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: 'v0.971'
+    rev: 'v1.3.0'
     hooks:
       - id: mypy
         additional_dependencies: [types-cachetools]
4 changes: 2 additions & 2 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -31,8 +31,8 @@ dependencies:
 - fmt>=9.1.0,<10
 - fsspec>=0.6.0
 - gcc_linux-64=11.*
-- gmock>=1.13.0.*
-- gtest>=1.13.0.*
+- gmock>=1.13.0
+- gtest>=1.13.0
 - hypothesis
 - ipython
 - libarrow==11.0.0.*
2 changes: 2 additions & 0 deletions conda/recipes/libcudf/meta.yaml
@@ -396,6 +396,8 @@ outputs:
           - {{ pin_subpackage('libcudf', exact=True) }}
           - {{ pin_subpackage('libcudf_kafka', exact=True) }}
           - cudatoolkit {{ cuda_spec }}
+          - gtest {{ gtest_version }}
+          - gmock {{ gtest_version }}
           - libcurand {{ libcurand_run_version }}
     about:
       home: https://rapids.ai/
13 changes: 7 additions & 6 deletions cpp/benchmarks/CMakeLists.txt
@@ -153,8 +153,8 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp
 # ##################################################################################################
 # * stream_compaction benchmark -------------------------------------------------------------------
 ConfigureNVBench(
-  STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/unique.cpp
-  stream_compaction/unique_count.cpp
+  STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/distinct_count.cpp
+  stream_compaction/unique.cpp stream_compaction/unique_count.cpp
 )

 # ##################################################################################################
@@ -195,8 +195,7 @@ ConfigureBench(
   reduction/reduce.cpp reduction/scan.cpp
 )
 ConfigureNVBench(
-  REDUCTION_NVBENCH reduction/distinct_count.cpp reduction/rank.cpp reduction/scan_structs.cpp
-  reduction/segmented_reduce.cpp
+  REDUCTION_NVBENCH reduction/rank.cpp reduction/scan_structs.cpp reduction/segmented_reduce.cpp
 )

 # ##################################################################################################
@@ -280,7 +279,6 @@ ConfigureNVBench(TEXT_NVBENCH text/minhash.cpp)
 # * strings benchmark -------------------------------------------------------------------
 ConfigureBench(
   STRINGS_BENCH
-  string/case.cpp
   string/combine.cpp
   string/contains.cpp
   string/convert_datetime.cpp
@@ -301,7 +299,10 @@ ConfigureBench(
   string/url_decode.cu
 )

-ConfigureNVBench(STRINGS_NVBENCH string/like.cpp string/reverse.cpp string/lengths.cpp)
+ConfigureNVBench(
+  STRINGS_NVBENCH string/case.cpp string/char_types.cpp string/lengths.cpp string/like.cpp
+  string/reverse.cpp
+)

 # ##################################################################################################
 # * json benchmark -------------------------------------------------------------------
cpp/benchmarks/{reduction → stream_compaction}/distinct_count.cpp
@@ -15,14 +15,14 @@
  */

 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/rmm_pool_raii.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>

-#include <cudf/detail/stream_compaction.hpp>
+#include <cudf/stream_compaction.hpp>

 #include <nvbench/nvbench.cuh>

 template <typename Type>
-static void bench_reduction_distinct_count(nvbench::state& state, nvbench::type_list<Type>)
+static void bench_distinct_count(nvbench::state& state, nvbench::type_list<Type>)
 {
   auto const dtype = cudf::type_to_id<Type>();
   auto const size  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
@@ -40,16 +40,19 @@ static void bench_reduction_distinct_count(nvbench::state& state, nvbench::type_list<Type>)
   auto const& data_column = data_table->get_column(0);
   auto const input_table  = cudf::table_view{{data_column, data_column, data_column}};

+  auto mem_stats_logger = cudf::memory_stats_logger();  // init stats logger
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    rmm::cuda_stream_view stream{launch.get_stream()};
-    cudf::detail::distinct_count(input_table, cudf::null_equality::EQUAL, stream);
+    cudf::distinct_count(input_table, cudf::null_equality::EQUAL);
   });
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
 }

 using data_type = nvbench::type_list<int32_t, int64_t, float, double>;

-NVBENCH_BENCH_TYPES(bench_reduction_distinct_count, NVBENCH_TYPE_AXES(data_type))
-  .set_name("reduction_distinct_count")
+NVBENCH_BENCH_TYPES(bench_distinct_count, NVBENCH_TYPE_AXES(data_type))
+  .set_name("distinct_count")
   .add_int64_axis("num_rows",
                   {
                     10000,  // 10k
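Note: the conversion above swaps the stream-taking detail API for the public one. A minimal sketch of that public call, for reference only (it uses cudf's test column wrapper, which is not part of this diff):

#include <cudf/stream_compaction.hpp>
#include <cudf/table/table_view.hpp>

#include <cudf_test/column_wrapper.hpp>

void distinct_count_example()
{
  // Three distinct rows among five; null_equality::EQUAL treats nulls as equal to each other.
  cudf::test::fixed_width_column_wrapper<int32_t> col({1, 1, 2, 3, 3});
  cudf::column_view v = col;
  auto n = cudf::distinct_count(cudf::table_view{{v}}, cudf::null_equality::EQUAL);
  // n == 3
}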
70 changes: 49 additions & 21 deletions cpp/benchmarks/string/case.cpp
@@ -15,36 +15,64 @@
  */

 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
+#include <benchmarks/fixture/rmm_pool_raii.hpp>

 #include <cudf/strings/case.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>

-class StringCase : public cudf::benchmark {};
+#include <nvbench/nvbench.cuh>

-static void BM_case(benchmark::State& state)
+void bench_case(nvbench::state& state)
 {
-  cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
-  auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows});
-  cudf::strings_column_view input(column->view());
+  auto const n_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const max_width = static_cast<int32_t>(state.get_int64("width"));
+  auto const encoding  = state.get_string("encoding");

-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    cudf::strings::to_lower(input);
+  if (static_cast<std::size_t>(n_rows) * static_cast<std::size_t>(max_width) >=
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
   }

-  state.SetBytesProcessed(state.iterations() * input.chars_size());
-}
+  data_profile const profile = data_profile_builder().distribution(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_width);
+  auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
+
+  auto col_view = column->view();
+
+  cudf::column::contents ascii_contents;
+  if (encoding == "ascii") {
+    data_profile ascii_profile = data_profile_builder().no_validity().distribution(
+      cudf::type_id::INT8, distribution_id::UNIFORM, 32, 126);  // nice ASCII range
+    auto input = cudf::strings_column_view(col_view);
+    auto ascii_column =
+      create_random_column(cudf::type_id::INT8, row_count{input.chars_size()}, ascii_profile);
+    auto ascii_data = ascii_column->view();
+
-#define SORT_BENCHMARK_DEFINE(name)          \
-  BENCHMARK_DEFINE_F(StringCase, name)       \
-  (::benchmark::State & st) { BM_case(st); } \
-  BENCHMARK_REGISTER_F(StringCase, name)     \
-    ->RangeMultiplier(8)                     \
-    ->Ranges({{1 << 12, 1 << 24}})           \
-    ->UseManualTime()                        \
-    ->Unit(benchmark::kMillisecond);
+    col_view = cudf::column_view(col_view.type(),
+                                 col_view.size(),
+                                 nullptr,
+                                 col_view.null_mask(),
+                                 col_view.null_count(),
+                                 0,
+                                 {input.offsets(), ascii_data});
+
+    ascii_contents = ascii_column->release();
+  }
+  auto input = cudf::strings_column_view(col_view);
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+
+  state.add_element_count(input.chars_size(), "chars_size");
+  state.add_global_memory_reads<nvbench::int8_t>(input.chars_size());
+  state.add_global_memory_writes<nvbench::int8_t>(input.chars_size());
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) { auto result = cudf::strings::to_lower(input); });
 }

-SORT_BENCHMARK_DEFINE(to_lower)
+NVBENCH_BENCH(bench_case)
+  .set_name("strings_case")
+  .add_int64_axis("width", {32, 64, 128, 256, 512, 1024, 2048})
+  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
+  .add_string_axis("encoding", {"ascii", "utf8"});
66 changes: 66 additions & 0 deletions cpp/benchmarks/string/char_types.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+
+#include <cudf/strings/char_types/char_types.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+static void bench_char_types(nvbench::state& state)
+{
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const api_type  = state.get_string("api");
+
+  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
+  }
+
+  data_profile const table_profile = data_profile_builder().distribution(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+  auto const table =
+    create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);
+  cudf::strings_column_view input(table->view().column(0));
+  auto input_types = cudf::strings::string_character_types::SPACE;
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  // gather some throughput statistics as well
+  auto chars_size = input.chars_size();
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);  // all bytes are read;
+  if (api_type == "all") {
+    state.add_global_memory_writes<nvbench::int8_t>(num_rows);  // output is a bool8 per row
+  } else {
+    state.add_global_memory_writes<nvbench::int8_t>(chars_size);
+  }
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    if (api_type == "all") {
+      auto result = cudf::strings::all_characters_of_type(input, input_types);
+    } else {
+      auto result = cudf::strings::filter_characters_of_type(input, input_types);
+    }
+  });
+}
+
+NVBENCH_BENCH(bench_char_types)
+  .set_name("char_types")
+  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096})
+  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
+  .add_string_axis("api", {"all", "filter"});
5 changes: 5 additions & 0 deletions cpp/cmake/thirdparty/patches/nvbench_override.json
@@ -12,6 +12,11 @@
       "file" : "nvbench/use_existing_fmt.diff",
       "issue" : "Fix add support for using an existing fmt [https://github.com/NVIDIA/nvbench/pull/125]",
       "fixed_in" : ""
+    },
+    {
+      "file" : "nvbench/public_fmt_dep_in_conda.diff",
+      "issue" : "Propagate fmt requirement in conda envs [https://github.com/NVIDIA/nvbench/pull/127]",
+      "fixed_in" : ""
     }
   ]
 }
6 changes: 6 additions & 0 deletions cpp/include/cudf/copying.hpp
@@ -228,6 +228,9 @@ std::unique_ptr<column> empty_like(scalar const& input);
  *
  * Supports only fixed-width types.
  *
+ * If the `mask_alloc` allocates a validity mask that mask is also uninitialized
+ * and the validity bits and the null count should be set by the caller.
+ *
  * @param[in] input Immutable view of input column to emulate
  * @param[in] mask_alloc Optional, Policy for allocating null mask. Defaults to RETAIN
  * @param[in] mr Device memory resource used to allocate the returned column's device memory
@@ -244,6 +247,9 @@ std::unique_ptr<column> allocate_like(
  *
  * Supports only fixed-width types.
  *
+ * If the `mask_alloc` allocates a validity mask that mask is also uninitialized
+ * and the validity bits and the null count should be set by the caller.
+ *
  * @param[in] input Immutable view of input column to emulate
  * @param[in] size The desired number of elements that the new column should have capacity for
  * @param[in] mask_alloc Optional, Policy for allocating null mask. Defaults to RETAIN
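Note: per the added wording, a mask allocated with mask_allocation_policy::ALWAYS comes back uninitialized. A minimal sketch of the caller's follow-up, assuming the existing public helpers cudf::set_null_mask and cudf::column::set_null_count (illustrative, not part of this diff):

#include <cudf/copying.hpp>
#include <cudf/null_mask.hpp>

#include <memory>

// Allocate an uninitialized clone, then make every row valid and record the count.
std::unique_ptr<cudf::column> make_all_valid_like(cudf::column_view const& input)
{
  auto out  = cudf::allocate_like(input, cudf::mask_allocation_policy::ALWAYS);
  auto view = out->mutable_view();
  cudf::set_null_mask(view.null_mask(), 0, view.size(), true);  // set all validity bits
  out->set_null_count(0);                                       // keep the count consistent
  return out;
}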
16 changes: 15 additions & 1 deletion cpp/include/cudf/detail/contiguous_split.hpp
@@ -67,7 +67,7 @@ class metadata_builder {
    * @brief Destructor that will be implemented as default, required because metadata_builder_impl
    * is incomplete at this stage.
    */
-  ~metadata_builder() = default;
+  ~metadata_builder();

   /**
    * @brief Add a column to this metadata builder.
@@ -105,9 +105,23 @@ class metadata_builder {
    */
   std::vector<uint8_t> build() const;

+  /**
+   * @brief Clear the internal buffer containing all added metadata.
+   */
+  void clear();
+
  private:
   std::unique_ptr<metadata_builder_impl> impl;
 };

+/**
+ * @copydoc pack_metadata
+ * @param builder The reusable builder object to create packed column metadata.
+ */
+std::vector<uint8_t> pack_metadata(table_view const& table,
+                                   uint8_t const* contiguous_buffer,
+                                   size_t buffer_size,
+                                   metadata_builder& builder);
+
 }  // namespace detail
 }  // namespace cudf
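A sketch of the reuse pattern the new overload and clear() enable; these are detail:: APIs, so this is internal-use-only and the surrounding loop and data are illustrative:

#include <cudf/detail/contiguous_split.hpp>

#include <cstddef>
#include <cstdint>
#include <vector>

// Pack metadata for several contiguous buffers with one reusable builder,
// clearing its internal buffer between tables instead of rebuilding it.
std::vector<std::vector<uint8_t>> pack_all(std::vector<cudf::table_view> const& tables,
                                           std::vector<uint8_t const*> const& bufs,
                                           std::vector<size_t> const& sizes,
                                           cudf::detail::metadata_builder& builder)
{
  std::vector<std::vector<uint8_t>> out;
  for (size_t i = 0; i < tables.size(); ++i) {
    out.push_back(cudf::detail::pack_metadata(tables[i], bufs[i], sizes[i], builder));
    builder.clear();  // reset added metadata before the next table
  }
  return out;
}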
2 changes: 2 additions & 0 deletions cpp/include/cudf/null_mask.hpp
@@ -36,6 +36,8 @@ namespace cudf {
  * @brief Returns the null count for a null mask of the specified `state`
  * representing `size` elements.
  *
+ * @throw std::invalid_argument if state is UNINITIALIZED
+ *
  * @param state The state of the null mask
  * @param size The number of elements represented by the mask
  * @return The count of null elements
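A small sketch of the newly documented contract, assuming this doc block belongs to cudf::state_null_count:

#include <cudf/null_mask.hpp>

void state_null_count_example()
{
  auto n0 = cudf::state_null_count(cudf::mask_state::UNALLOCATED, 100);  // 0 nulls
  auto n1 = cudf::state_null_count(cudf::mask_state::ALL_NULL, 100);     // 100 nulls
  // Per the new doc, this now throws std::invalid_argument:
  // cudf::state_null_count(cudf::mask_state::UNINITIALIZED, 100);
}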
8 changes: 5 additions & 3 deletions cpp/include/cudf_test/iterator_utilities.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -121,15 +121,17 @@ template <typename Iter>
  * The returned iterator yields `false` (to mark `null`) at the indices corresponding to the
  * pointers having `nullptr` values and `true` for the remaining indices.
  *
+ * @note The input vector is referenced by the transform iterator, so the
+ * lifespan must be just as long as the iterator.
+ *
  * @tparam T the data type
  * @param ptrs The data pointers for which the validity iterator is computed
  * @return auto Validity iterator
  */
 template <class T>
 [[maybe_unused]] static auto nulls_from_nullptrs(std::vector<T const*> const& ptrs)
 {
-  // The vector `indices` is copied into the lambda as it can be destroyed at the caller site.
-  return thrust::make_transform_iterator(ptrs.begin(), [ptrs](auto ptr) { return ptr != nullptr; });
+  return thrust::make_transform_iterator(ptrs.begin(), [](auto ptr) { return ptr != nullptr; });
 }

 }  // namespace iterators
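A short sketch of the lifetime rule in the new @note: the iterator now borrows the vector rather than copying it into the lambda, so the vector must outlive the iterator (usage below is illustrative):

#include <cudf_test/iterator_utilities.hpp>

#include <vector>

void nulls_from_nullptrs_example()
{
  int a = 1, b = 2;
  std::vector<int const*> ptrs{&a, nullptr, &b};  // must stay alive while `valid` is used
  auto valid = cudf::test::iterators::nulls_from_nullptrs(ptrs);
  // *valid == true, *(valid + 1) == false, *(valid + 2) == true
}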
14 changes: 8 additions & 6 deletions cpp/include/cudf_test/type_lists.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -84,11 +84,13 @@ std::enable_if_t<cudf::is_fixed_width<TypeParam>() && !cudf::is_timestamp_t<TypeParam>
                  thrust::host_vector<TypeParam>>
 make_type_param_vector(std::initializer_list<T> const& init_list)
 {
-  thrust::host_vector<TypeParam> vec(init_list.size());
-  std::transform(std::cbegin(init_list), std::cend(init_list), std::begin(vec), [](auto const& e) {
-    if constexpr (std::is_unsigned_v<TypeParam>) { return static_cast<TypeParam>(std::abs(e)); }
-    return static_cast<TypeParam>(e);
-  });
+  std::vector<T> input{init_list};
+  std::vector<TypeParam> vec(init_list.size());
+  std::transform(
+    std::cbegin(input), std::cend(input), std::begin(vec), [](auto const& e) -> TypeParam {
+      if constexpr (std::is_unsigned_v<TypeParam>) { return static_cast<TypeParam>(std::abs(e)); }
+      return static_cast<TypeParam>(e);
+    });
   return vec;
 }
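For reference, a sketch of the helper's observable behavior, which this change preserves (negative initializers are mapped through std::abs only for unsigned target types):

#include <cudf_test/type_lists.hpp>

void make_type_param_vector_example()
{
  auto u = cudf::test::make_type_param_vector<uint32_t>({-1, 2, -3});
  // u == {1, 2, 3}: unsigned targets take the absolute value
  auto s = cudf::test::make_type_param_vector<int64_t>({-1, 2, -3});
  // s == {-1, 2, -3}: signed targets cast straight through
}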